Esempio n. 1
0
 def __init__(self, access_token, access_token_secret, api_key,
              api_secret_key, before_sleep=None):
     """Build the Twitter wrapper and the request retryer used by this client.

     Args:
         access_token: first credential forwarded to `TwitterWrapper`.
         access_token_secret: second credential forwarded to `TwitterWrapper`.
         api_key: third credential forwarded to `TwitterWrapper`.
         api_secret_key: fourth credential forwarded to `TwitterWrapper`.
         before_sleep: forwarded as-is to `create_request_retryer`.
     """
     credentials = (access_token, access_token_secret,
                    api_key, api_secret_key)

     self.wrapper = TwitterWrapper(*credentials)
     self.retryer = create_request_retryer(before_sleep=before_sleep)
Esempio n. 2
0
def twitter_users_action(cli_args):
    """Enrich a CSV file with Twitter user metadata.

    User identifiers are read from column `cli_args.column` of
    `cli_args.file`, grouped with `as_chunks(100, ...)` and resolved through
    the `users/lookup` endpoint. Each input row is written to
    `cli_args.output`, followed by the `USER_FIELDS` columns.

    Args:
        cli_args: parsed CLI arguments. This function reads `access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `output`, `select`, `total`, `column` and `ids`.

    Raises:
        TwitterHTTPError: re-raised for any HTTP error whose code is not 404.
    """
    wrapper = TwitterWrapper(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        # Each chunk item is a (row, user) pair; the API takes the users as a
        # single comma-separated string.
        users = ','.join(user for _, user in chunk)

        if cli_args.ids:
            wrapper_args = {'user_id': users}
            key = 'id'
        else:
            wrapper_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = wrapper.call(['users', 'lookup'], **wrapper_args)
        except TwitterHTTPError as e:
            if e.e.code != 404:
                raise

            # 404 is handled as "nothing to add for this chunk": the rows are
            # still written, with an empty addendum, so they are not lost.
            # Nothing else from this iteration may be reused here: `result`
            # was never assigned for this chunk.
            for row, _ in chunk:
                enricher.writerow(row, None)

            loading_bar.update(len(chunk))
            continue

        # When the wrapper returns None, the rows of this chunk are not
        # written at all.
        if result is not None:
            indexed_result = {}

            for user in result:
                user = normalize_user(user)
                indexed_result[user[key]] = format_user_as_csv_row(user)

            for row, user in chunk:
                # Users absent from the API answer get a None addendum.
                enricher.writerow(row, indexed_result.get(user))

        loading_bar.update(len(chunk))
Esempio n. 3
0
    def action(namespace, output_file):
        """Write one output row per id retrieved for each user of the input.

        Users are read from column `namespace.column` of `namespace.file`.
        For each of them the `[method_name, 'ids']` endpoint is paginated
        with its cursor until the cursor is 0 or the wrapper returns None.

        Args:
            namespace: parsed CLI arguments. This function reads
                `access_token`, `access_token_secret`, `api_key`,
                `api_secret_key`, `file`, `select`, `total`, `column`
                and `id`.
            output_file: output target handed to `casanova.enricher`.
        """

        # TODO: this is temp debug
        def listener(event, data):
            tqdm.write(event, file=sys.stderr)
            tqdm.write(repr(data), file=sys.stderr)

        wrapper = TwitterWrapper(
            namespace.access_token,
            namespace.access_token_secret,
            namespace.api_key,
            namespace.api_secret_key,
            listener=listener
        )

        enricher = casanova.enricher(
            namespace.file,
            output_file,
            keep=namespace.select,
            add=csv_headers
        )

        loading_bar = tqdm(
            desc='Retrieving ids',
            dynamic_ncols=True,
            total=namespace.total,
            unit=' followers',
            postfix={'users': 0}
        )

        cells = enricher.cells(namespace.column, with_rows=True)

        for users_done, (row, user) in enumerate(cells, 1):
            lookup_key = 'user_id' if namespace.id else 'screen_name'
            wrapper_kwargs = {lookup_key: user}

            # -1 is the cursor value used for the first request.
            cursor = -1

            while cursor != 0:
                wrapper_kwargs['cursor'] = cursor
                page = wrapper.call([method_name, 'ids'], **wrapper_kwargs)

                # No answer: stop paginating this user.
                if page is None:
                    break

                page_ids = page.get('ids', [])
                cursor = page.get('next_cursor', 0)

                loading_bar.update(len(page_ids))

                for user_id in page_ids:
                    enricher.writerow(row, [user_id])

            loading_bar.set_postfix(users=users_done)

        loading_bar.close()
Esempio n. 4
0
def twitter_users_action(namespace, output_file):
    """Enrich a CSV file with Twitter user metadata.

    User identifiers are read from column `namespace.column` of
    `namespace.file`, grouped with `as_chunks(100, ...)` and resolved through
    the `users/lookup` endpoint. Each input row is written to `output_file`,
    followed by the `USER_FIELDS` columns.

    Args:
        namespace: parsed CLI arguments. This function reads `access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `total`, `column` and `ids`.
        output_file: output target handed to `casanova.enricher`.
    """
    wrapper = TwitterWrapper(namespace.access_token,
                             namespace.access_token_secret, namespace.api_key,
                             namespace.api_secret_key)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=USER_FIELDS)

    loading_bar = tqdm(desc='Retrieving users',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' user')

    # The chunk size belongs to `as_chunks`; `enricher.cells` takes the
    # column name as its first argument.
    for chunk in as_chunks(
            100, enricher.cells(namespace.column, with_rows=True)):
        # Each chunk item is a (row, user) pair; the API takes the users as a
        # single comma-separated string.
        users = ','.join(user for _, user in chunk)

        if namespace.ids:
            wrapper_args = {'user_id': users}
            key = 'id'
        else:
            wrapper_args = {'screen_name': users}
            key = 'screen_name'

        result = wrapper.call(['users', 'lookup'], **wrapper_args)

        # When the wrapper returns None, the rows of this chunk are not
        # written at all.
        if result is not None:
            indexed_result = {}

            for user in result:
                user = normalize_user(user)
                indexed_result[user[key]] = format_user_as_csv_row(user)

            for row, user in chunk:
                # Users absent from the API answer get a None addendum.
                enricher.writerow(row, indexed_result.get(user))

        loading_bar.update(len(chunk))

    loading_bar.close()
Esempio n. 5
0
class TwitterAPIClient(object):
    """Client pairing a `TwitterWrapper` with a request retryer.

    Attributes are set in `__init__`:
        wrapper: the `TwitterWrapper` built from the four credentials.
        retryer: the object returned by `create_request_retryer`.
    """

    def __init__(self, access_token, access_token_secret, api_key,
                 api_secret_key, before_sleep=None):
        """Create the wrapper and the retryer.

        Args:
            access_token: first credential forwarded to `TwitterWrapper`.
            access_token_secret: second credential forwarded to
                `TwitterWrapper`.
            api_key: third credential forwarded to `TwitterWrapper`.
            api_secret_key: fourth credential forwarded to `TwitterWrapper`.
            before_sleep: forwarded as-is to `create_request_retryer`.
        """
        credentials = (access_token, access_token_secret,
                       api_key, api_secret_key)

        self.wrapper = TwitterWrapper(*credentials)
        self.retryer = create_request_retryer(before_sleep=before_sleep)

    # NOTE(review): `retrying_method` presumably relies on `self.retryer`;
    # confirm against its definition.
    @retrying_method()
    def call(self, *args, **kwargs):
        """Forward all arguments to `self.wrapper.call` and return its result."""
        response = self.wrapper.call(*args, **kwargs)
        return response
Esempio n. 6
0
    def action(cli_args):
        """Write, in batches, the ids retrieved for each user of the input.

        Users are read from column `cli_args.column` of `cli_args.file`. For
        each of them the `[method_name, 'ids']` endpoint is paginated with
        its cursor, and every page is written with `enricher.writebatch`.
        When `cli_args.resume` is set, the state popped from
        `cli_args.output` provides the starting cursor and the values to
        skip.

        Args:
            cli_args: parsed CLI arguments. This function reads
                `access_token`, `access_token_secret`, `api_key`,
                `api_secret_key`, `file`, `output`, `select`, `column`,
                `ids` and `resume`.
        """
        enricher = casanova.batch_enricher(
            cli_args.file,
            cli_args.output,
            keep=cli_args.select,
            add=csv_headers
        )

        loading_bar = LoadingBar(
            desc='Retrieving ids',
            unit=method_name[:-1],
            stats={'users': 0}
        )

        # TODO: this is temp debug
        def listener(event, data):
            loading_bar.print(event)
            loading_bar.print(repr(data))

        wrapper = TwitterWrapper(
            cli_args.access_token,
            cli_args.access_token_secret,
            cli_args.api_key,
            cli_args.api_secret_key,
            listener=listener
        )

        resuming_state = cli_args.output.pop_state() if cli_args.resume else None

        for row, user in enricher.cells(cli_args.column, with_rows=True):
            loading_bar.update_stats(user=user)

            # -1 is the cursor used for a first request, unless a resuming
            # state provides the cursor to restart from.
            cursor = -1

            if resuming_state is not None and resuming_state.last_cursor:
                cursor = int(resuming_state.last_cursor)

            lookup_key = 'user_id' if cli_args.ids else 'screen_name'
            wrapper_kwargs = {lookup_key: user}

            while cursor != 0:
                wrapper_kwargs['cursor'] = cursor

                # The resuming state is consumed by the first request made;
                # later pages have nothing to skip.
                already_written = None

                if resuming_state:
                    already_written = resuming_state.values_to_skip
                    resuming_state = None

                try:
                    page = wrapper.call([method_name, 'ids'],
                                        **wrapper_kwargs)
                except TwitterHTTPError:

                    # The user does not exist
                    loading_bar.inc('users_not_found')
                    break

                # No answer: stop paginating this user.
                if page is None:
                    break

                page_ids = page.get('ids', [])
                cursor = page.get('next_cursor', 0)

                loading_bar.update(len(page_ids))

                batch = [
                    [user_id]
                    for user_id in page_ids
                    if not (already_written and user_id in already_written)
                ]

                # A final cursor of 0 is recorded as None.
                enricher.writebatch(row, batch, cursor or None)

            loading_bar.inc('users')
Esempio n. 7
0
def twitter_user_tweets_action(namespace, output_file):
    """Write one output row per tweet retrieved from each user's timeline.

    Users are read from column `namespace.column` of `namespace.file`. The
    `statuses/user_timeline` endpoint is called repeatedly, moving `max_id`
    below the smallest tweet id seen so far, until it yields nothing or an
    HTTP error occurs.

    Args:
        namespace: parsed CLI arguments. This function reads `access_token`,
            `access_token_secret`, `api_key`, `api_secret_key`, `file`,
            `select`, `total`, `column`, `ids` and `exclude_retweets`.
        output_file: output target handed to `casanova.enricher`.
    """
    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=TWEET_FIELDS
    )

    loading_bar = LoadingBar(
        'Retrieving tweets',
        total=namespace.total,
        unit='tweet'
    )

    for row, user in enricher.cells(namespace.column, with_rows=True):
        max_id = None

        loading_bar.update_stats(user=user)

        while True:
            lookup_key = 'user_id' if namespace.ids else 'screen_name'

            kwargs = {
                lookup_key: user,
                'include_rts': not namespace.exclude_retweets,
                'count': TWITTER_API_MAX_STATUSES_COUNT,
                'tweet_mode': 'extended'
            }

            # Only set once a first page has been retrieved.
            if max_id is not None:
                kwargs['max_id'] = max_id

            loading_bar.inc('calls')

            try:
                tweets = wrapper.call(['statuses', 'user_timeline'], **kwargs)
            except TwitterHTTPError as error:
                loading_bar.inc('errors')

                if error.e.code == 404:
                    message = 'Could not find user "%s"' % user
                else:
                    message = (
                        'An error happened when attempting to retrieve tweets from "%s"'
                        % user
                    )

                loading_bar.print(message)
                break

            # An empty answer ends the pagination for this user.
            if not tweets:
                break

            loading_bar.update(len(tweets))

            # The next call asks for tweets strictly older than this page.
            oldest_id = min(int(tweet['id_str']) for tweet in tweets)
            max_id = oldest_id - 1

            for tweet in tweets:
                normalized = normalize_tweet(tweet, collection_source='api')
                enricher.writerow(row, format_tweet_as_csv_row(normalized))

        loading_bar.inc('done')

    loading_bar.close()
Esempio n. 8
0
    def action(namespace, output_file):
        """Write one output row per id retrieved for each user, resumably.

        Users are read from column `namespace.column` of `namespace.file`.
        For each of them the `[method_name, 'ids']` endpoint is paginated
        with its cursor. Output rows carry the `csv_headers` columns plus a
        `cursor` column: the last id written for a page holds the next
        cursor, or 'end' once the cursor is 0; other rows hold ''.

        When `namespace.resume` is set, the last batch found in the output
        file tells which users to skip, which cursor to restart from and
        which ids are already written.

        Args:
            namespace: parsed CLI arguments. This function reads
                `access_token`, `access_token_secret`, `api_key`,
                `api_secret_key`, `file`, `select`, `total`, `column`,
                `ids` and `resume`.
            output_file: output target handed to `casanova.enricher`; its
                `name` is read back when resuming.
        """

        # TODO: this is temp debug
        def listener(event, data):
            tqdm.write(event, file=sys.stderr)
            tqdm.write(repr(data), file=sys.stderr)

        wrapper = TwitterWrapper(namespace.access_token,
                                 namespace.access_token_secret,
                                 namespace.api_key,
                                 namespace.api_secret_key,
                                 listener=listener)

        enricher = casanova.enricher(namespace.file,
                                     output_file,
                                     keep=namespace.select,
                                     add=csv_headers + ['cursor'],
                                     resumable=namespace.resume,
                                     auto_resume=False)

        loading_bar = tqdm(desc='Retrieving ids',
                           dynamic_ncols=True,
                           total=namespace.total,
                           unit=' followers',
                           postfix={'users': 0})

        # Counters displayed in the loading bar postfix.
        users_done = 0
        users_not_found = 0
        skipped = 0

        def update_stats():
            # Refresh the postfix; the optional counters only appear once
            # they are non-zero.
            kwargs = {'users': users_done}

            if users_not_found:
                kwargs['not_found'] = users_not_found

            if skipped:
                kwargs['skipped'] = skipped

            loading_bar.set_postfix(**kwargs)

        # Resuming information; this function reads its `value`, `finished`,
        # `cursor` and `rows` attributes.
        last_batch = None

        if namespace.resume:
            # TODO: sacralize this in specialized casanova enricher
            last_batch = casanova.reverse_reader.last_batch(
                output_file.name,
                batch_value=namespace.column,
                batch_cursor='cursor',
                end_symbol='end')

        for row, user in enricher.cells(namespace.column, with_rows=True):
            if last_batch:
                # Every user before the one of the last batch is skipped.
                if user != last_batch.value:
                    skipped += 1
                    update_stats()
                    continue

                # The user of the last batch was fully processed: skip it
                # too and stop skipping from the next user on.
                if user == last_batch.value and last_batch.finished:
                    last_batch = None
                    skipped += 1
                    update_stats()
                    continue

            all_ids = []
            # -1 is the cursor used for a first request.
            next_cursor = -1
            result = None

            # NOTE(review): the cursor comes from the output file and is
            # used without conversion — confirm its type suits the
            # `!= 0` test below.
            if last_batch and last_batch.cursor:
                next_cursor = last_batch.cursor

            if namespace.ids:
                wrapper_kwargs = {'user_id': user}
            else:
                wrapper_kwargs = {'screen_name': user}

            while next_cursor != 0:
                wrapper_kwargs['cursor'] = next_cursor

                skip_in_output = None

                # The last batch is consumed by the first request made;
                # later pages have nothing to skip.
                # NOTE(review): `row[-2]` is presumably the id column read
                # back from the output file; verify its values compare
                # equal to the ids returned by the API.
                if last_batch:
                    skip_in_output = set(row[-2] for row in last_batch.rows)
                    last_batch = None

                try:
                    result = wrapper.call([method_name, 'ids'],
                                          **wrapper_kwargs)
                except TwitterHTTPError as e:

                    # The user does not exist
                    # (any TwitterHTTPError is counted this way)
                    users_not_found += 1
                    update_stats()
                    break

                if result is not None:
                    all_ids = result.get('ids', [])
                    next_cursor = result.get('next_cursor', 0)

                    loading_bar.update(len(all_ids))

                    for is_last, user_id in with_is_last(all_ids):
                        # Skipped ids never reach the cursor-writing branch
                        # below, even when they are the last of the page.
                        if skip_in_output and user_id in skip_in_output:
                            continue

                        # The last id of the page records where to resume:
                        # the next cursor, or 'end' when the cursor is 0.
                        if is_last:
                            addendum = [user_id, next_cursor or 'end']
                        else:
                            addendum = [user_id, '']

                        enricher.writerow(row, addendum)
                else:
                    # No answer: stop paginating this user.
                    next_cursor = 0

            # Also reached after the `break` above, so users counted as not
            # found are counted as done too.
            users_done += 1
            update_stats()

        loading_bar.close()