def test_exceptions(self, tmpdir):
    with pytest.raises(EmptyFileError):
        casanova.enricher(StringIO(''), StringIO(''))

    output_path = str(tmpdir.join('./wrong_resumer.csv'))

    with pytest.raises(TypeError):
        resumer = ThreadSafeResumer(output_path)

        with open('./test/resources/people.csv') as f, resumer:
            casanova.enricher(f, resumer)

def test_resumable(self, tmpdir):
    log = defaultdict(list)

    def listener(name, row):
        log[name].append(list(row))

    output_path = str(tmpdir.join('./enriched_resumable.csv'))
    resumer = RowCountResumer(output_path, listener=listener)

    with open('./test/resources/people.csv') as f, resumer:
        enricher = casanova.enricher(
            f, resumer,
            add=('x2',),
            keep=('name',)
        )

        row = next(iter(enricher))
        enricher.writerow(row, [2])

    assert collect_csv(output_path) == [
        ['name', 'x2'],
        ['John', '2']
    ]

    with open('./test/resources/people.csv') as f, resumer:
        enricher = casanova.enricher(
            f, resumer,
            add=('x2',),
            keep=('name',)
        )

        for i, row in enumerate(enricher):
            enricher.writerow(row, [(i + 2) * 2])

    assert collect_csv(output_path) == [
        ['name', 'x2'],
        ['John', '2'],
        ['Mary', '4'],
        ['Julia', '6']
    ]

    assert log == {
        'output.row': [['John', '2']],
        'input.row': [['John', 'Matthews']]
    }

def url_parse_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()
        youtube_url = parse_youtube_url(url)

        if not youtube_url:
            enricher.writerow(row)
            continue

        enricher.writerow(
            row,
            [
                YOUTUBE_TYPES.get(type(youtube_url)),
                youtube_url.id,
                getattr(youtube_url, 'name', None)
            ]
        )

def captions_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        result = get_video_captions(video, langs=namespace.lang)
        loading_bar.update()

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

    loading_bar.close()

def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving comments', unit='comment', stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()

def videos_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_VIDEO_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving videos', unit='video', total=namespace.total)

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    iterator = enricher.cells(namespace.column, with_rows=True)

    for (row, _), video in client.videos(iterator, key=itemgetter(1)):
        loading_bar.update()
        enricher.writerow(row, video.as_csv_row() if video else None)

    loading_bar.close()

def crowdtangle_posts_by_id_action(namespace, output_file):
    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    already_done = 0

    def listener(event, row):
        nonlocal already_done

        if event == 'resume.input':
            already_done += 1

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CROWDTANGLE_POST_CSV_HEADERS,
        resumable=namespace.resume,
        listener=listener
    )

    loading_bar = tqdm(
        desc='Retrieving posts',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' posts'
    )

    loading_bar.update(already_done)

    loading_bar_context = LoadingBarContext(loading_bar)

    try:
        for row, url in enricher.cells(namespace.column, with_rows=True):
            with loading_bar_context:
                url = url.strip()

                if not url:
                    enricher.writerow(row)
                    continue

                url = ensure_protocol(url)

                if not is_facebook_post_url(url):
                    enricher.writerow(row)
                    continue

                post_id = facebook.post_id_from_url(url)

                if post_id is None:
                    enricher.writerow(row)
                    continue

                post = client.post(post_id, format='csv_row')
                enricher.writerow(row, post)

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])

def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)

def comments_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar('Retrieving comments', unit='comment', stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = tqdm(
        desc='Extracting content',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' docs'
    )

    files = create_report_iterator(namespace, enricher, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()

def filter_and_enrich_tweets_from_csv(f, cat_urls, of=sys.stdout, total=None):
    categories = list(cat_urls.keys())

    casa = casanova.enricher(f, of, add=["matched_urls", "webentities"] + categories)
    links_pos = casa.pos.links

    try:
        for row in tqdm(casa, total=total):
            links = [normalize_url(u) for u in row[links_pos].split('|')]

            if not links:
                continue

            matched_urls = []
            webentities = set()
            cat_belongings = []

            for cat in categories:
                cat_match = False

                for we, urls in cat_urls[cat].items():
                    for u in links:
                        if u in urls:
                            cat_match = True
                            matched_urls.append(u)
                            webentities.add(we)
                            links.remove(u)

                cat_belongings.append(cat_match)

            if webentities:
                casa.writerow(row, ["|".join(matched_urls), "|".join(webentities)] + cat_belongings)

    except Exception as e:
        print("ERROR while processing", row, file=sys.stderr)
        raise e

def url_extract_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=REPORT_HEADERS,
        keep=cli_args.select
    )

    extract = EXTRACTORS[getattr(cli_args, 'from')]

    loading_bar = LoadingBar(desc='Extracting', unit='row', total=cli_args.total)

    for row, content in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if cli_args.base_url is not None:
                url = urljoin(cli_args.base_url, url)

            enricher.writerow(row, [url])

def captions_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving captions',
        unit='video'
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        result = get_video_captions(video, langs=cli_args.lang)

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

def search_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar('Searching videos', unit='video')

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, query in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=cli_args.order)

        if cli_args.limit:
            searcher = islice(searcher, cli_args.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())

def videos_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()
    column = namespace.column

    def rows_with_videos_id():
        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):
        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])

def twitter_friends_action(namespace, output_file):
    TWITTER = {
        'access_token': namespace.access_token,
        'access_token_secret': namespace.access_token_secret,
        'api_key': namespace.api_key,
        'api_secret_key': namespace.api_secret_key
    }

    wrapper = TwitterWrapper(TWITTER)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving ids',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' line'
    )

    for row, user_id in enricher.cells(namespace.column, with_rows=True):
        all_ids = []

        result = wrapper.call('friends.ids', args={'user_id': user_id})

        if result is not None:
            all_ids = result.get('ids', None)

        for friend_id in all_ids:
            enricher.writerow(row, [friend_id])

        loading_bar.update()

    loading_bar.close()

def url_extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select.split(',') if namespace.select else None
    )

    extract = EXTRACTORS[getattr(namespace, 'from')]

    loading_bar = tqdm(
        desc='Extracting',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, content in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if namespace.base_url is not None:
                url = urljoin(namespace.base_url, url)

            enricher.writerow(row, [url])

    output_file.close()

def url_join_action(cli_args):
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(
        cli_args.file2,
        cli_args.output,
        add=left_headers
    )

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        urls = [cell]

        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()
        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)

def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())

def test_combined_pos(self, tmpdir):
    output_path = str(tmpdir.join('./enriched.csv'))

    with open('./test/resources/people.csv') as f, \
         open(output_path, 'w', newline='') as of:

        enricher = casanova.enricher(f, of, add=('line',), keep=('surname',))

        assert len(enricher.output_headers) == 2
        assert enricher.output_headers.surname == 0
        assert enricher.output_headers.line == 1

def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(
        desc='Collecting data',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' urls'
    )

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=namespace.start_date,
                with_top_posts=namespace.posts is not None,
                sort_by=namespace.sort_by,
                format='csv_row',
                platforms=namespace.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        except Exception as err:
            raise err

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()

def twitter_users_action(cli_args):
    client = TwitterAPIClient(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        users = ','.join(row[1].lstrip('@') for row in chunk)

        if cli_args.ids:
            client_args = {'user_id': users}
            key = 'id'
        else:
            client_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = client.call(['users', 'lookup'], **client_args)
        except TwitterHTTPError as e:
            if e.e.code == 404:
                for row, user in chunk:
                    enricher.writerow(row)
            else:
                raise e

            continue

        indexed_result = {}

        for user in result:
            user = normalize_user(user)
            user_row = format_user_as_csv_row(user)
            indexed_result[user[key]] = user_row

        for row, user in chunk:
            user_row = indexed_result.get(user.lstrip('@'))
            enricher.writerow(row, user_row)

        loading_bar.update(len(chunk))

def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' % (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()

def action(namespace, output_file):

    # TODO: this is temp debug
    def listener(event, data):
        tqdm.write(event, file=sys.stderr)
        tqdm.write(repr(data), file=sys.stderr)

    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key,
        listener=listener
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=csv_headers
    )

    loading_bar = tqdm(
        desc='Retrieving ids',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' followers',
        postfix={'users': 0}
    )

    users_done = 0

    for row, user in enricher.cells(namespace.column, with_rows=True):
        all_ids = []
        next_cursor = -1
        result = None

        if namespace.id:
            wrapper_kwargs = {'user_id': user}
        else:
            wrapper_kwargs = {'screen_name': user}

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor

            result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)

            if result is not None:
                all_ids = result.get('ids', [])
                next_cursor = result.get('next_cursor', 0)

                loading_bar.update(len(all_ids))

                for user_id in all_ids:
                    enricher.writerow(row, [user_id])
            else:
                next_cursor = 0

        users_done += 1
        loading_bar.set_postfix(users=users_done)

    loading_bar.close()

def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' % prettyprint_seconds(retry_state.idle_for)
            )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template, value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()

def crowdtangle_summary_action(cli_args):
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=cli_args.posts is not None,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if cli_args.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow(post.as_csv_row())

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)

        loading_bar.update()

def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()

def action(namespace, output_file):
    TWITTER = {
        'access_token': namespace.access_token,
        'access_token_secret': namespace.access_token_secret,
        'api_key': namespace.api_key,
        'api_secret_key': namespace.api_secret_key
    }

    wrapper = TwitterWrapper(TWITTER)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=csv_headers
    )

    loading_bar = tqdm(
        desc='Retrieving ids',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' line'
    )

    for row, user in enricher.cells(namespace.column, with_rows=True):
        all_ids = []
        next_cursor = -1
        result = None

        if namespace.id:
            wrapper_args = {'user_id': user}
        else:
            wrapper_args = {'screen_name': user}

        while next_cursor != 0:
            wrapper_args['cursor'] = next_cursor

            method = '%(method_name)s.ids' % {'method_name': method_name}
            result = wrapper.call(method, wrapper_args)

            if result is not None:
                all_ids = result.get('ids', [])
                next_cursor = result.get('next_cursor', 0)

                for friend_id in all_ids:
                    enricher.writerow(row, [friend_id])
            else:
                next_cursor = 0

        loading_bar.update()

    loading_bar.close()

def test_dialect(self, tmpdir):
    output_path = str(tmpdir.join('./enriched.csv'))

    with open('./test/resources/semicolons.csv') as f, \
         open(output_path, 'w', newline='') as of:

        enricher = casanova.enricher(f, of, add=('line',), delimiter=';')

        for i, row in enumerate(enricher):
            enricher.writerow(row, [i])

    assert collect_csv(output_path) == [
        ['name', 'surname', 'line'],
        ['Rose', 'Philips', '0'],
        ['Luke', 'Atman', '1']
    ]