Example #1
def scrape_twitter_account_by_id(upstream_ids, stub=False, labels=None):
    """
    Scrape Twitter bio data for a list of upstream IDs and create (or update)
    the corresponding profiles. Accepts Twitter IDs rather than usernames.

    Keyword arguments:
    stub -- add the profile in stub mode (default False)
    labels -- dictionary mapping upstream IDs to labels (default None)
    """
    if len(upstream_ids) > 100:
        raise ScrapeException('Twitter API max is 100 user IDs per request.')

    db_session = worker.get_session()
    profiles = []

    # Request from Twitter API.
    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'user_id': ','.join(upstream_ids)}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Update the profile.
    for profile_json in response.json():
        profile = Profile(
            'twitter',
            profile_json['id_str'],
            profile_json['screen_name']
        )
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(
                                    Profile.upstream_id == profile_json['id_str']
                                ) \
                                .one()
            # Profiles already in the system are either not stubs or
            # being updated to full profiles
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if labels and profile.upstream_id in labels:
            _label_profile(db_session, profile, labels[profile.upstream_id])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)
        if not stub:
            app.queue.schedule_avatar(
                profile, profile_json['profile_image_url_https']
            )
            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
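
A minimal usage sketch (not part of the original example): it assumes the
surrounding module already provides worker, app, TWITTER_HEADERS, the proxy
table, and the Profile model. The IDs are placeholders, and the label values
are only a guess at whatever shape _label_profile expects.

# Hypothetical call: scrape two accounts by numeric Twitter ID and label one.
ids_to_scrape = ['20', '783214']
id_labels = {'783214': ['official']}  # assumed label format
scraped = scrape_twitter_account_by_id(ids_to_scrape, stub=False, labels=id_labels)
print('Scraped {} profiles'.format(len(scraped)))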
Example #2
def scrape_twitter_account(usernames, stub=False, labels=None):
    """
    Scrape twitter bio data and create (or update) a list of profile
    usernames.

    Keyword arguments:
    stub -- add the profile in stub mode (default False)
    labels -- dictionary of username labels (default None)
    """

    if len(usernames) > 100:
        raise ScrapeException('Twitter API max is 100 usernames per request.')

    profiles = []
    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'screen_name': ','.join(usernames)}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Get Twitter ID and upsert the profile.
    for profile_json in response.json():
        user_id = profile_json['id_str']
        profile = Profile('twitter', user_id, profile_json['screen_name'])
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site == 'twitter') \
                                .filter(Profile.upstream_id == user_id) \
                                .one()
            # Profiles already in the system are either not stubs or
            # being updated to full profiles
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if labels and profile.username.lower() in labels:
            _label_profile(db_session, profile, labels[profile.username.lower()])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)

        if not stub:
            app.queue.schedule_avatar(
                profile, profile_json['profile_image_url_https']
            )

            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
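
A minimal usage sketch (not part of the original example): same assumptions as
above, except the lookup is by screen name and, per the code, label keys must
be lowercase usernames.

# Hypothetical call: scrape accounts by username; stub mode skips avatar/posts.
names_to_scrape = ['twitter', 'jack']
name_labels = {'jack': ['founder']}  # assumed label format, keys lowercased
scraped = scrape_twitter_account(names_to_scrape, stub=True, labels=name_labels)
print('Scraped {} profiles'.format(len(scraped)))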