Example #1
def sleep_determinate(period):
    '''Sleep for a specified period of time with progress updates.'''

    total_sleep = 0
    worker.start_job(total=int(math.ceil(period)))

    while total_sleep < period:
        time.sleep(1)
        total_sleep += 1
        worker.update_job(current=total_sleep)

    worker.finish_job()
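
The worker module used in these examples is project-internal (QuickPin's background-task helpers), so the snippets are not runnable on their own. A minimal self-contained sketch of the same determinate-progress contract, with a hypothetical PrintWorker standing in for worker:

import math
import time

class PrintWorker:
    '''Hypothetical stand-in for the project-internal worker module.'''
    def start_job(self, total):
        self.total = total

    def update_job(self, current):
        print('progress: {}/{}'.format(current, self.total))

    def finish_job(self):
        print('done')

worker = PrintWorker()

def sleep_determinate(period):
    '''Sleep for `period` seconds, reporting progress once per second.'''
    total_sleep = 0
    worker.start_job(total=int(math.ceil(period)))
    while total_sleep < period:
        time.sleep(1)
        total_sleep += 1
        worker.update_job(current=total_sleep)
    worker.finish_job()

sleep_determinate(3)  # progress: 1/3, 2/3, 3/3, then 'done'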
Example #2
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']

        if friends_cursor == 0:
            break # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        followers_response.raise_for_status()

        # Ignore followers already in the db
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_response.json()['next_cursor']

        if followers_cursor == 0:
            break # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        # chunk_end can overshoot the total on the last partial chunk.
        worker.update_job(current=chunk_start + len(chunk))

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
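
Both while loops above follow Twitter's cursor pagination: friends/ids and followers/ids return up to 5,000 ids per page along with a next_cursor, and a cursor of 0 marks the last page. A condensed sketch of that pattern (fetch_all_ids is a hypothetical name; it assumes valid auth headers and omits the proxy and deduplication details):

import requests

def fetch_all_ids(url, user_id, headers, max_results):
    # Page through a Twitter ids endpoint until max_results ids are
    # collected or the cursor runs out.
    ids = []
    params = {
        'count': 5000,
        'user_id': user_id,
        'stringify_ids': True,
        'cursor': -1,
    }
    while len(ids) < max_results:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        body = response.json()
        ids.extend(body['ids'][:max_results - len(ids)])
        if body['next_cursor'] == 0:
            break  # Cursor 0 means there are no more pages.
        params['cursor'] = body['next_cursor']
    return ids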
Example #3
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    while followers_results < max_results:
        # Get followers from Instagram API
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
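
The add/commit/rollback dance around IntegrityError in both loops is a get-or-create: insert optimistically, and if the unique constraint on the profile fires, load the row that already exists. Distilled into one sketch (get_or_create_profile is a hypothetical helper, assuming the Profile constructor used throughout these examples):

from sqlalchemy.exc import IntegrityError

def get_or_create_profile(db, site, upstream_id, username):
    # Optimistic insert; on a unique-constraint violation, fall back to
    # loading the existing row.
    profile = Profile(site, upstream_id, username, is_stub=True)
    db.add(profile)
    try:
        db.commit()
    except IntegrityError:
        db.rollback()
        profile = db.query(Profile) \
            .filter(Profile.site == site) \
            .filter(Profile.upstream_id == upstream_id) \
            .one()
    return profile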
Example #4
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the user identified by id_.
    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    post_ids = []  # Collect ids of new posts to index after committing.

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()

        tweets = response.json()
        # A short (or empty) page means there are no more results.
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # GeoJSON points list longitude before latitude.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center of the bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
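
The place handling above approximates a tweet's location by averaging the vertices of the place's bounding polygon. That step in isolation (bounding_box_center is a hypothetical name; the input is a list of [longitude, latitude] pairs, as found in place['bounding_box']['coordinates'][0]):

def bounding_box_center(vertices):
    # Average the polygon's vertices; GeoJSON pairs are [longitude, latitude].
    lons = [lon for lon, lat in vertices]
    lats = [lat for lon, lat in vertices]
    return sum(lons) / len(lons), sum(lats) / len(lats)

lon, lat = bounding_box_center(
    [[-122.5, 37.7], [-122.5, 37.8], [-122.3, 37.8], [-122.3, 37.7]]
)
print(lon, lat)  # -122.4 37.75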
Example #5
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)
    post_ids = []  # Collect ids of new posts to index after committing.

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
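
The since/max seeding in both post scrapers makes the same decision: with recent=True, ask the API only for posts newer than the newest stored post; otherwise ask only for posts older than the oldest one. A sketch of that logic (incremental_params is hypothetical; stored_posts is ordered newest-first and its items expose upstream_id, and Twitter's since_id/max_id key names can be passed in place of Instagram's min_id/max_id):

def incremental_params(stored_posts, recent, newer_key='min_id',
                       older_key='max_id'):
    # Seed pagination parameters from the newest or oldest stored post.
    params = {}
    if stored_posts:
        if recent:
            params[newer_key] = str(stored_posts[0].upstream_id)
        else:
            params[older_key] = str(stored_posts[-1].upstream_id)
    return params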
Example #6
def scrape_twitter_relations(id_):
    '''
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    #max_results = _get_max_relations(db)['twitter']
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']

        if friends_cursor == 0:
            break # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()

        # Ignore followers already in the db
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_response.json()['next_cursor']

        if followers_cursor == 0:
            break # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        # chunk_end can overshoot the total on the last partial chunk.
        worker.update_job(current=chunk_start + len(chunk))

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
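
The lookup loop above works in slices of 100 because Twitter's users/lookup endpoint accepts at most 100 user ids per request. The slicing on its own (chunked is a hypothetical helper):

def chunked(items, size):
    # Yield consecutive slices of at most `size` items.
    for start in range(0, len(items), size):
        yield items[start:start + size]

for chunk in chunked(list(range(250)), 100):
    print(len(chunk))  # 100, 100, 50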
Example #7
def scrape_twitter_posts(id_, recent):
    '''
    Fetch tweets for the user identified by id_.
    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    '''
    db = worker.get_session()
    #max_results = _get_max_posts(db)['twitter']
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    post_ids = []  # Collect ids of new posts to index after committing.

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()

        tweets = response.json()
        # A short (or empty) page means there are no more results.
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # GeoJSON points list longitude before latitude.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center of the bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
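
A note on the coordinates handling above: tweet['coordinates'], when present, is a GeoJSON Point, and GeoJSON lists longitude before latitude. A small sketch of a safe extractor (tweet_point is a hypothetical name):

def tweet_point(tweet):
    # Return (latitude, longitude) from a tweet's GeoJSON point, or None.
    geo = tweet.get('coordinates')
    if geo is None:
        return None
    lon, lat = geo['coordinates']  # GeoJSON order: [longitude, latitude]
    return lat, lon

print(tweet_point({'coordinates': {'type': 'Point',
                                   'coordinates': [-0.1276, 51.5072]}}))
# (51.5072, -0.1276)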
Example #8
def scrape_instagram_relations(id_):
    '''
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    #max_results = _get_max_relations(db)['instagram']
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    while followers_results < max_results:
        # Get followers from Instagram API
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
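
One detail worth noting in both relation scrapers: the "already in db" checks test membership against plain lists (current_friends_ids, current_followers_ids), which costs O(n) per candidate. Building a set first makes each check O(1). A sketch of that optimization (new_relations is hypothetical and not part of the original code):

def new_relations(candidate_ids, known_ids):
    # A set gives constant-time membership tests versus a linear scan.
    known = set(known_ids)
    return [uid for uid in candidate_ids if uid not in known]

print(new_relations(['1', '2', '3', '4'], ['2', '4']))  # ['1', '3']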
Example #9
def scrape_instagram_posts(id_, recent):
    '''
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except ValueError:
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)
    logging.warning('WORKER max results: {}'.format(max_results))
    post_ids = []  # Collect ids of new posts to index after committing.

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
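
The image handling above reduces to one download step: fetch the standard-resolution URL and keep the filename, MIME type, and bytes that the File constructor expects. In isolation (download_attachment is a hypothetical name; the File model is project-internal):

import os
import requests
from urllib.parse import urlparse

def download_attachment(image_url):
    # Fetch an image and return (name, mime, content) for File(...).
    response = requests.get(image_url)
    response.raise_for_status()
    name = os.path.basename(urlparse(image_url).path)
    mime = response.headers.get('Content-Type', 'application/octet-stream')
    return name, mime, response.content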