def sleep_determinate(period):
    """
    Sleep for `period` seconds, reporting per-second progress updates.

    Progress is reported through the worker job API so monitors can render a
    determinate progress bar; the total is the period rounded up to whole
    seconds.
    """
    slept = 0
    worker.start_job(total=int(math.ceil(period)))

    while slept < period:
        # Sleep at most one second at a time so progress can be reported.
        # Clamp the final interval: the original always slept a full second,
        # oversleeping fractional periods (e.g. period=0.5 slept 1s).
        time.sleep(min(1, period - slept))
        slept += 1
        worker.update_job(current=slept)

    worker.finish_job()
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin.

    NOTE(review): a second definition of this function appears later in this
    module and shadows this one at import time — the duplicates should be
    reconciled.

    :param id_: primary key of the Profile row to scrape relations for.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_relations_twitter is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)

    max_results = get_config(db, 'max_relations_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile. Use sets: the
    # loops below test membership for every fetched ID (O(1) vs O(n) list).
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = {
        follower.upstream_id for follower in followers_query
    }

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        # NOTE(review): verify=False disables TLS certificate validation —
        # presumably required for the proxy setup; confirm.
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()
        friends_json = friends_response.json()

        # Ignore friends already in the db.
        for friend_id in friends_json['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_json['next_cursor']
        if friends_cursor == 0:
            break  # No more results
        params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        followers_response.raise_for_status()
        followers_json = followers_response.json()

        # Ignore followers already in the db.
        for follower_id in followers_json['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_json['next_cursor']
        if followers_cursor == 0:
            break  # No more results
        params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]

    worker.start_job(total=len(user_ids))
    chunk_size = 100
    lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'

    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        # BUG FIX: the comprehension variable was named `id_`, shadowing the
        # function parameter (a real clobber under Python 2, confusing under
        # Python 3).
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site == 'twitter') \
                    .filter(Profile.upstream_id == uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        # Clamp so the final partial chunk cannot report past the job total.
        worker.update_job(current=min(chunk_end, len(user_ids)))

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin.

    NOTE(review): a second definition of this function appears later in this
    module and shadows this one at import time — the duplicates should be
    reconciled.

    :param id_: primary key of the Profile row to scrape relations for.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_relations_instagram is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0

    max_results = get_config(db, 'max_relations_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile. Sets give O(1)
    # membership tests in the loops below (the original scanned lists).
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = {
        follower.upstream_id for follower in followers_query
    }

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
        .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API.
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == friend['id']) \
                        .one()
                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)
                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter,
        # otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
        .format(profile.upstream_id)

    # Get followers from Instagram API.
    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == follower['id']) \
                        .one()
                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)
                if followers_results == max_results:
                    break

        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the user identified by id_.

    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.

    NOTE(review): a second definition of this function appears later in this
    module and shadows this one at import time.

    :param id_: primary key of the author Profile row.
    :param recent: True to fetch newer-than-stored, False for older.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_posts_twitter is not an integer.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile, newest first.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    # BUG FIX: initialise once, outside the loop. The original rebound
    # post_ids on every page, so only the final page of tweets was passed to
    # schedule_index_posts().
    post_ids = list()

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()
        tweets = response.json()

        # A short (or empty) page means this is the last page.
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the
            # max_id/since_id so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # BUG FIX: `coordinates` is a GeoJSON Point object whose
                    # 'coordinates' member is [longitude, latitude]. The
                    # original unpacked the dict itself, which yields its
                    # *keys* ('type', 'coordinates'), not the coordinates.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']
                if place is not None:
                    # Set longitude/latitude to the center of the bounding
                    # polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0
                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1
                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)

                # Set the max_id to the last tweet to get the next set of
                # results.
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.

    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.

    NOTE(review): a second definition of this function appears later in this
    module and shadows this one at import time.

    :param id_: primary key of the author Profile row.
    :param recent: True to fetch newer-than-stored, False for older.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_posts_instagram is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)

    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
        .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)

    # BUG FIX: initialise once, outside the loop. The original rebound
    # post_ids on every page (so only the last page got indexed) and raised
    # NameError at schedule_index_posts() when the loop body never ran.
    post_ids = list()

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it.
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']
                if 'name' in gram['location']:
                    post.location = gram['location']['name']
                    # street_address only appended when a name was set, so
                    # we never += onto an unset location.
                    if 'street_address' in gram['location']:
                        post.location += \
                            ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            # Report progress after incrementing; the original reported
            # before, so the progress bar lagged by one.
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish.
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin.

    NOTE(review): this is a duplicate of an earlier definition in this
    module; being later, this one wins at import time. The earlier copy sent
    TWITTER_HEADERS on every request while this one did not — the headers
    are restored here for consistency; the duplicates should be reconciled.

    :param id_: primary key of the Profile row to scrape relations for.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_relations_twitter is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)

    max_results = get_config(db, 'max_relations_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile. Sets give O(1)
    # membership tests in the loops below.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = {
        follower.upstream_id for follower in followers_query
    }

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()
        friends_json = friends_response.json()

        # Ignore friends already in the db.
        for friend_id in friends_json['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_json['next_cursor']
        if friends_cursor == 0:
            break  # No more results
        params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        followers_response.raise_for_status()
        followers_json = followers_response.json()

        # Ignore followers already in the db.
        for follower_id in followers_json['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_json['next_cursor']
        if followers_cursor == 0:
            break  # No more results
        params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]

    worker.start_job(total=len(user_ids))
    chunk_size = 100
    lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'

    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        # BUG FIX: the comprehension variable was named `id_`, shadowing the
        # function parameter.
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site == 'twitter') \
                    .filter(Profile.upstream_id == uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        # Clamp so the final partial chunk cannot report past the job total.
        worker.update_job(current=min(chunk_end, len(user_ids)))

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the user identified by id_.

    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.

    NOTE(review): this is a duplicate of an earlier definition in this
    module; being later, this one wins at import time. The earlier copy sent
    TWITTER_HEADERS while this one did not — the headers are restored here
    for consistency; the duplicates should be reconciled.

    :param id_: primary key of the author Profile row.
    :param recent: True to fetch newer-than-stored, False for older.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_posts_twitter is not an integer.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile, newest first.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    # BUG FIX: initialise once, outside the loop. The original rebound
    # post_ids on every page, so only the final page was indexed.
    post_ids = list()

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()
        tweets = response.json()

        # A short (or empty) page means this is the last page.
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the
            # max_id/since_id so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # BUG FIX: `coordinates` is a GeoJSON Point object whose
                    # 'coordinates' member is [longitude, latitude]; the
                    # original unpacked the dict's keys instead.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']
                if place is not None:
                    # Set longitude/latitude to the center of the bounding
                    # polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0
                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1
                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)

                # Set the max_id to the last tweet to get the next set of
                # results.
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.

    The number of friends and followers to fetch is configured in Admin.

    NOTE(review): this is a duplicate of an earlier definition in this
    module; being later, this one wins at import time. The duplicates should
    be reconciled.

    :param id_: primary key of the Profile row to scrape relations for.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_relations_instagram is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0

    max_results = get_config(db, 'max_relations_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile. Sets give O(1)
    # membership tests in the loops below.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.friend_id == Profile.id)
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = {friend.upstream_id for friend in friends_query}

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            (profile_join_self.c.follower_id == Profile.id)
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = {
        follower.upstream_id for follower in followers_query
    }

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
        .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API.
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    # Already exists: use the existing profile.
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == friend['id']) \
                        .one()
                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)
                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter,
        # otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
        .format(profile.upstream_id)

    # Get followers from Instagram API.
    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )
                db.add(related_profile)
                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                        .query(Profile) \
                        .filter(Profile.site == 'instagram') \
                        .filter(Profile.upstream_id == follower['id']) \
                        .one()
                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)
                if followers_results == max_results:
                    break

        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.

    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.

    NOTE(review): this is a duplicate of an earlier definition in this
    module; being later, this one wins at import time. The duplicates should
    be reconciled. A leftover debug logging.warning call and the unused
    `more_results` variable were removed.

    :param id_: primary key of the author Profile row.
    :param recent: True to fetch newer-than-stored, False for older.
    :raises ValueError: if no profile exists with the given id.
    :raises ScrapeException: if max_posts_instagram is not an integer.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)

    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Narrowed from a bare `except:`.
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
        .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        if recent:
            # Only fetch posts newer than those already stored in db.
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        else:
            # Only fetch posts older than those already stored in db.
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)

    # BUG FIX: initialise once, outside the loop. The original rebound
    # post_ids on every page (so only the last page got indexed) and raised
    # NameError at schedule_index_posts() when the loop body never ran.
    post_ids = list()

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it.
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']
                if 'name' in gram['location']:
                    post.location = gram['location']['name']
                    # street_address only appended when a name was set, so
                    # we never += onto an unset location.
                    if 'street_address' in gram['location']:
                        post.location += \
                            ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            # Report progress after incrementing; the original reported
            # before, so the progress bar lagged by one.
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish.
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)