def __init__(self, access_token, access_token_secret, api_key,
             api_secret_key, before_sleep=None):
    """Set up the underlying Twitter wrapper and the request retryer.

    Args:
        access_token: forwarded as first argument to `TwitterWrapper`.
        access_token_secret: forwarded as second argument.
        api_key: forwarded as third argument.
        api_secret_key: forwarded as fourth argument.
        before_sleep: forwarded to `create_request_retryer` as its
            `before_sleep` keyword argument (defaults to `None`).
    """
    # The wrapper expects the four credentials positionally, in this order.
    credentials = (
        access_token,
        access_token_secret,
        api_key,
        api_secret_key,
    )

    self.wrapper = TwitterWrapper(*credentials)
    self.retryer = create_request_retryer(before_sleep=before_sleep)
def twitter_users_action(cli_args):
    """Enrich a CSV file with metadata about the Twitter users it lists.

    Users are read from the `cli_args.column` column of `cli_args.file`,
    looked up through the `users/lookup` endpoint by chunks of 100 and every
    input row is written to `cli_args.output`, followed by the `USER_FIELDS`
    columns.

    Args:
        cli_args: parsed command line arguments. The attributes read here
            are `access_token`, `access_token_secret`, `api_key`,
            `api_secret_key`, `file`, `output`, `select`, `total`, `column`
            and `ids`.

    Raises:
        TwitterHTTPError: when the API call fails with anything but a 404.
    """
    wrapper = TwitterWrapper(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    # NOTE(review): 100 is presumably the per-request limit of the endpoint.
    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        # Each chunk item is a (row, cell value) pair.
        users = ','.join(row[1] for row in chunk)

        if cli_args.ids:
            wrapper_args = {'user_id': users}
            key = 'id'
        else:
            wrapper_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = wrapper.call(['users', 'lookup'], **wrapper_args)
        except TwitterHTTPError as e:
            if e.e.code != 404:
                raise

            # A 404 is handled as "no user of this chunk was found": using an
            # empty result makes every row of the chunk be written below
            # without user data, instead of reusing a stale (or unbound)
            # `user_row`/`result` left over from a previous iteration.
            result = []

        if result is not None:
            indexed_result = {}

            for user in result:
                user = normalize_user(user)
                indexed_result[user[key]] = format_user_as_csv_row(user)

            # NOTE(review): `user` is the raw CSV cell here; a match requires
            # `normalize_user` to expose `key` values equal to it -- confirm.
            for row, user in chunk:
                # Users missing from the response get `None` as addendum.
                enricher.writerow(row, indexed_result.get(user))

        loading_bar.update(len(chunk))
def action(namespace, output_file):
    """Write, for every user of the input file, the ids returned by the API.

    For each cell of the `namespace.column` column, the
    `[method_name, 'ids']` endpoint is called page after page (cursor based)
    and one output row is written per retrieved id.

    `method_name` and `csv_headers` are not defined in this function; they
    are presumably provided by the enclosing scope -- TODO confirm.

    Args:
        namespace: parsed command line arguments (`access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `total`, `column` and `id` are read).
        output_file: target handed to `casanova.enricher`.
    """

    # TODO: this is temp debug
    def listener(event, data):
        tqdm.write(event, file=sys.stderr)
        tqdm.write(repr(data), file=sys.stderr)

    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key,
        listener=listener
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=csv_headers
    )

    loading_bar = tqdm(
        desc='Retrieving ids',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' followers',
        postfix={'users': 0}
    )

    cells = enricher.cells(namespace.column, with_rows=True)

    for users_done, (row, user) in enumerate(cells, 1):
        # NOTE(review): `namespace.id` (singular) is what this block reads.
        identifier = 'user_id' if namespace.id else 'screen_name'
        wrapper_kwargs = {identifier: user}

        # -1 asks for the first page, 0 means there is nothing left to fetch.
        next_cursor = -1

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor
            result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)

            if result is None:
                # No payload: stop paginating for this user.
                next_cursor = 0
                continue

            page_ids = result.get('ids', [])
            next_cursor = result.get('next_cursor', 0)

            loading_bar.update(len(page_ids))

            for user_id in page_ids:
                enricher.writerow(row, [user_id])

        loading_bar.set_postfix(users=users_done)

    loading_bar.close()
def twitter_users_action(namespace, output_file):
    """Enrich a CSV file with metadata about the Twitter users it lists.

    Users are read from the `namespace.column` column of `namespace.file`,
    looked up through the `users/lookup` endpoint by chunks of 100 and every
    input row is written to `output_file`, followed by the `USER_FIELDS`
    columns.

    Args:
        namespace: parsed command line arguments (`access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `total`, `column` and `ids` are read).
        output_file: target handed to `casanova.enricher`.
    """
    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=USER_FIELDS
    )

    loading_bar = tqdm(
        desc='Retrieving users',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' user'
    )

    # The chunk size belongs to `as_chunks`, not to `enricher.cells`: the
    # latter takes the column to read as its first argument.
    for chunk in as_chunks(100, enricher.cells(namespace.column, with_rows=True)):
        # Each chunk item is a (row, cell value) pair.
        users = ','.join(row[1] for row in chunk)

        if namespace.ids:
            wrapper_args = {'user_id': users}
            key = 'id'
        else:
            wrapper_args = {'screen_name': users}
            key = 'screen_name'

        result = wrapper.call(['users', 'lookup'], **wrapper_args)

        if result is not None:
            indexed_result = {}

            for user in result:
                user = normalize_user(user)
                indexed_result[user[key]] = format_user_as_csv_row(user)

            for row, user in chunk:
                # Users missing from the response get `None` as addendum.
                enricher.writerow(row, indexed_result.get(user))

        loading_bar.update(len(chunk))

    loading_bar.close()
class TwitterAPIClient(object):
    """Client delegating API calls to a `TwitterWrapper` instance.

    The `call` method is decorated with `retrying_method()`; it presumably
    relies on the `retryer` attribute created in `__init__` -- TODO confirm
    against the decorator's implementation.
    """

    def __init__(self, access_token, access_token_secret, api_key,
                 api_secret_key, before_sleep=None):
        """Create the wrapped `TwitterWrapper` and the request retryer.

        The four credentials are forwarded positionally to `TwitterWrapper`
        and `before_sleep` is forwarded to `create_request_retryer`.
        """
        credentials = (
            access_token,
            access_token_secret,
            api_key,
            api_secret_key,
        )

        self.wrapper = TwitterWrapper(*credentials)
        self.retryer = create_request_retryer(before_sleep=before_sleep)

    @retrying_method()
    def call(self, *args, **kwargs):
        """Forward every argument to `self.wrapper.call`, return its result."""
        response = self.wrapper.call(*args, **kwargs)
        return response
def action(cli_args):
    """Write, for every user of the input file, the ids returned by the API.

    For each cell of the `cli_args.column` column, the
    `[method_name, 'ids']` endpoint is called page after page (cursor based)
    and each page is written as one batch through `enricher.writebatch`.
    When `cli_args.resume` is set, the state popped from `cli_args.output`
    provides the cursor to restart from and the values to leave out of the
    first written batch.

    `method_name` and `csv_headers` are not defined in this function; they
    are presumably provided by the enclosing scope -- TODO confirm.

    Args:
        cli_args: parsed command line arguments (`file`, `output`, `select`,
            `access_token`, `access_token_secret`, `api_key`,
            `api_secret_key`, `resume`, `column` and `ids` are read).
    """
    enricher = casanova.batch_enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=csv_headers
    )

    loading_bar = LoadingBar(
        desc='Retrieving ids',
        unit=method_name[:-1],
        stats={'users': 0}
    )

    # TODO: this is temp debug
    def listener(event, data):
        loading_bar.print(event)
        loading_bar.print(repr(data))

    wrapper = TwitterWrapper(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key,
        listener=listener
    )

    resuming_state = cli_args.output.pop_state() if cli_args.resume else None

    for row, user in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update_stats(user=user)

        # -1 asks for the first page, 0 means there is nothing left to fetch.
        next_cursor = -1

        if resuming_state is not None and resuming_state.last_cursor:
            next_cursor = int(resuming_state.last_cursor)

        identifier = 'user_id' if cli_args.ids else 'screen_name'
        wrapper_kwargs = {identifier: user}

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor

            # The resuming state is consumed by the first call only.
            already_written = None

            if resuming_state:
                already_written = resuming_state.values_to_skip
                resuming_state = None

            try:
                result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)
            except TwitterHTTPError:
                # The user does not exist
                loading_bar.inc('users_not_found')
                break

            if result is None:
                # No payload: stop paginating for this user.
                next_cursor = 0
                continue

            page_ids = result.get('ids', [])
            next_cursor = result.get('next_cursor', 0)

            loading_bar.update(len(page_ids))

            batch = [
                [user_id]
                for user_id in page_ids
                if not (already_written and user_id in already_written)
            ]

            # A falsy cursor (last page) is recorded as `None`.
            enricher.writebatch(row, batch, next_cursor or None)

        loading_bar.inc('users')
def twitter_user_tweets_action(namespace, output_file):
    """Retrieve the timeline of every user listed in the input file.

    For each cell of the `namespace.column` column, the
    `statuses/user_timeline` endpoint is called repeatedly, moving `max_id`
    below the smallest tweet id of the previous page, until an empty page or
    an HTTP error is met. One output row is written per retrieved tweet.

    Args:
        namespace: parsed command line arguments (`access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `total`, `column`, `ids` and `exclude_retweets` are
            read).
        output_file: target handed to `casanova.enricher`.
    """
    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=TWEET_FIELDS
    )

    loading_bar = LoadingBar(
        'Retrieving tweets',
        total=namespace.total,
        unit='tweet'
    )

    for row, user in enricher.cells(namespace.column, with_rows=True):
        max_id = None
        loading_bar.update_stats(user=user)

        while True:
            kwargs = {
                ('user_id' if namespace.ids else 'screen_name'): user,
                'include_rts': not namespace.exclude_retweets,
                'count': TWITTER_API_MAX_STATUSES_COUNT,
                'tweet_mode': 'extended'
            }

            # Only set once a first page has been retrieved.
            if max_id is not None:
                kwargs['max_id'] = max_id

            loading_bar.inc('calls')

            try:
                tweets = wrapper.call(['statuses', 'user_timeline'], **kwargs)
            except TwitterHTTPError as e:
                loading_bar.inc('errors')

                if e.e.code == 404:
                    message = 'Could not find user "%s"'
                else:
                    message = 'An error happened when attempting to retrieve tweets from "%s"'

                loading_bar.print(message % user)

                # Give up on this user, whatever the error was.
                break

            if not tweets:
                break

            loading_bar.update(len(tweets))

            # Next page must end strictly before the oldest tweet seen here.
            oldest_id = min(int(item['id_str']) for item in tweets)
            max_id = oldest_id - 1

            for item in tweets:
                normalized = normalize_tweet(item, collection_source='api')
                enricher.writerow(row, format_tweet_as_csv_row(normalized))

        loading_bar.inc('done')

    loading_bar.close()
def action(namespace, output_file):
    """Write, for every user of the input file, the ids returned by the API.

    For each cell of the `namespace.column` column, the
    `[method_name, 'ids']` endpoint is called page after page (cursor based)
    and one output row is written per retrieved id. An extra `cursor` column
    is filled on the last id of each page only, with the next cursor or with
    'end' when there is no next page. When `namespace.resume` is set, this
    column is read back from the existing output to restart an interrupted
    run.

    `method_name` and `csv_headers` are not defined in this function; they
    are presumably provided by the enclosing scope -- TODO confirm.

    Args:
        namespace: parsed command line arguments (`access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `resume`, `total`, `column` and `ids` are read).
        output_file: target handed to `casanova.enricher`; its `name`
            attribute is read when resuming.
    """

    # TODO: this is temp debug
    def listener(event, data):
        tqdm.write(event, file=sys.stderr)
        tqdm.write(repr(data), file=sys.stderr)

    wrapper = TwitterWrapper(namespace.access_token, namespace.access_token_secret, namespace.api_key, namespace.api_secret_key, listener=listener)

    enricher = casanova.enricher(namespace.file, output_file, keep=namespace.select, add=csv_headers + ['cursor'], resumable=namespace.resume, auto_resume=False)

    loading_bar = tqdm(desc='Retrieving ids', dynamic_ncols=True, total=namespace.total, unit=' followers', postfix={'users': 0})

    # Counters shown in the loading bar postfix.
    users_done = 0
    users_not_found = 0
    skipped = 0

    def update_stats():
        """Refresh the loading bar postfix; zero counters other than `users` are omitted."""
        kwargs = {'users': users_done}

        if users_not_found:
            kwargs['not_found'] = users_not_found

        if skipped:
            kwargs['skipped'] = skipped

        loading_bar.set_postfix(**kwargs)

    last_batch = None

    if namespace.resume:
        # TODO: sacralize this in specialized casanova enricher
        last_batch = casanova.reverse_reader.last_batch(output_file.name, batch_value=namespace.column, batch_cursor='cursor', end_symbol='end')

    for row, user in enricher.cells(namespace.column, with_rows=True):
        if last_batch:
            # Resuming: skip every user until the one of the last batch.
            if user != last_batch.value:
                skipped += 1
                update_stats()
                continue

            # The last batch's user was fully processed: skip it as well and
            # stop resuming, so that the following users are handled normally.
            if user == last_batch.value and last_batch.finished:
                last_batch = None
                skipped += 1
                update_stats()
                continue

        all_ids = []
        # -1 asks for the first page, 0 means there is nothing left to fetch.
        next_cursor = -1
        result = None

        if last_batch and last_batch.cursor:
            # NOTE(review): the cursor is used as returned by `last_batch`,
            # without conversion -- confirm its type suits the API call.
            next_cursor = last_batch.cursor

        if namespace.ids:
            wrapper_kwargs = {'user_id': user}
        else:
            wrapper_kwargs = {'screen_name': user}

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor

            skip_in_output = None

            if last_batch:
                # Second-to-last cell of the rows of the last batch;
                # presumably the id column, right before `cursor` -- verify
                # against `csv_headers`. The resuming state is consumed here
                # and therefore only applies to the first call.
                skip_in_output = set(row[-2] for row in last_batch.rows)
                last_batch = None

            try:
                result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)
            except TwitterHTTPError as e:

                # The user does not exist
                # NOTE(review): every TwitterHTTPError lands here, whatever
                # its status code; the user is still counted in `users_done`.
                users_not_found += 1
                update_stats()
                break

            if result is not None:
                all_ids = result.get('ids', [])
                next_cursor = result.get('next_cursor', 0)

                loading_bar.update(len(all_ids))

                for is_last, user_id in with_is_last(all_ids):
                    if skip_in_output and user_id in skip_in_output:
                        continue

                    # Only the last id of a page carries the cursor
                    # information, 'end' marking a falsy next cursor.
                    if is_last:
                        addendum = [user_id, next_cursor or 'end']
                    else:
                        addendum = [user_id, '']

                    enricher.writerow(row, addendum)
            else:
                # No payload: stop paginating for this user.
                next_cursor = 0

        users_done += 1
        update_stats()

    loading_bar.close()