def schedule_username(username, site, group_id, total, tracker_id, test=False):
    '''
    Queue a job to fetch results for the specified username from the
    specified site.

    Keyword arguments:
    test -- don't archive, update site with result (default: False)
    '''

    kwargs = {
        'username': username,
        'site_id': site.id,
        'group_id': group_id,
        'total': total,
        'tracker_id': tracker_id,
        'test': test
    }

    job = _scrape_queue.enqueue_call(
        func=worker.scrape.check_username,
        kwargs=kwargs,
        timeout=_redis_worker['username_timeout']
    )

    description = 'Checking {} for user "{}"'.format(site.name, username)
    worker.init_job(job=job, description=description)

    return job.id
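# Example (illustrative sketch, not part of the original module): a caller
# such as a username-search view might schedule one check per site. `site`
# is assumed to be a model object with `id` and `name` attributes, `sites`
# an assumed list of such objects, and `tracker_id` an opaque string the
# client uses to follow progress; the returned job ID identifies the RQ job.
#
#     for site in sites:
#         job_id = schedule_username(username='johndoe', site=site,
#                                    group_id=1, total=len(sites),
#                                    tracker_id='tracker.abc123')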
def schedule_profile_id(site, upstream_id, profile_id=None, stub=False):
    """ Queue a job to fetch the specified profile from a social media site. """

    job = _scrape_queue.enqueue_call(
        func=worker.scrape.scrape_profile_by_id,
        args=(site, [upstream_id], stub),
        timeout=_redis_worker['profile_timeout']
    )

    description = 'Scraping bio for "{}" on {}'.format(upstream_id, site)
    worker.init_job(job=job, description=description, profile_id=profile_id)
def schedule_archive(username, group_id, tracker_id):
    ''' Queue a job to archive results for the job id. '''

    job = _archive_queue.enqueue_call(
        func=worker.archive.create_archive,
        args=[username, group_id, tracker_id],
        timeout=_redis_worker['archive_timeout']
    )

    description = 'Archiving results for username "{}"'.format(username)
    worker.init_job(job=job, description=description)
def schedule_sleep_indeterminate(period):
    """ Schedule an indeterminate sleep task (useful for testing). """

    description = 'Indeterminate sleep for {} seconds'.format(period)
    job = _scrape_queue.enqueue(worker.sleep.sleep_indeterminate, period,
                                timeout=period + 1)
    worker.init_job(job, description)
def schedule_sleep_exception(period):
    """ Schedule a sleep task that raises an exception (useful for testing). """

    description = 'Exception sleep for {} seconds'.format(period)
    job = _scrape_queue.enqueue(worker.sleep.sleep_exception, period,
                                timeout=period + 1)
    worker.init_job(job, description)
def schedule_index_posts(post_ids):
    """ Queue a job to index the specified posts. """

    job = _index_queue.enqueue_call(
        func=worker.index.index_posts,
        kwargs={'post_ids': post_ids},
        timeout=_redis_worker['solr_timeout']
    )

    description = 'Indexing {} posts'.format(len(post_ids))
    worker.init_job(job=job, description=description)
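# Example (illustrative sketch): after a batch of posts is committed to the
# database, their primary keys can be queued for indexing in a single job.
# `new_posts` is an assumed list of post model objects from the caller.
#
#     schedule_index_posts([post.id for post in new_posts])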
def schedule_index_profile(profile):
    """ Queue a job to index the specified profile. """

    job = _index_queue.enqueue_call(
        func=worker.index.index_profile,
        args=[profile.id],
        timeout=_redis_worker['solr_timeout']
    )

    description = 'Indexing profile "{}" on {}' \
                  .format(profile.username, profile.site_name())
    worker.init_job(job=job, description=description)
def schedule_avatar(profile, avatar_url):
    """ Queue a job to fetch an avatar image for the specified profile. """

    job = _scrape_queue.enqueue_call(
        func=worker.scrape.scrape_avatar,
        args=(profile.id, profile.site, avatar_url),
        timeout=_redis_worker['avatar_timeout']
    )

    description = 'Getting avatar image for "{}" on {}' \
                  .format(profile.username, profile.site_name())
    worker.init_job(job=job, description=description, profile_id=profile.id)
def schedule_delete_profile_from_index(profile):
    """ Queue a job to delete the specified profile from the index. """

    job = _index_queue.enqueue_call(
        func=worker.index.delete_profile,
        args=[profile.id],
        timeout=_redis_worker['solr_timeout']
    )

    description = 'Deleting profile "{}" on {} from index' \
                  .format(profile.username, profile.site_name())
    worker.init_job(job=job, description=description)
def schedule_site_test(site, tracker_id):
    '''
    Queue a job to test a site.

    Arguments:
    site -- the site to test.
    tracker_id -- the unique tracker ID for the job.
    '''

    job = _scrape_queue.enqueue_call(
        func=worker.scrape.test_site,
        args=[site.id, tracker_id],
        timeout=30
    )

    description = 'Testing site "{}"'.format(site.name)
    worker.init_job(job=job, description=description)

    return job.id
def schedule_posts(profile, recent=True):
    """ Queue a job to get posts for the specified profile. """

    scrapers = {
        'instagram': worker.scrape.scrape_instagram_posts,
        'twitter': worker.scrape.scrape_twitter_posts,
    }

    description = 'Getting posts for "{}" on {}' \
                  .format(profile.username, profile.site_name())
    type_ = 'posts'

    job = _scrape_queue.enqueue_call(
        func=scrapers[profile.site],
        args=(profile.id, recent),
        timeout=_redis_worker['posts_timeout']
    )

    worker.init_job(job=job, description=description, profile_id=profile.id,
                    type_=type_)
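# Example (illustrative sketch): `profile.site` selects the site-specific
# scraper, so a profile whose `site` attribute is 'twitter' dispatches to
# worker.scrape.scrape_twitter_posts. `profile` is assumed to be a model
# object exposing `id`, `site`, `username`, and `site_name()`; passing
# recent=False is assumed to request a full rather than incremental fetch.
#
#     schedule_posts(profile, recent=False)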
def schedule_relations(profile):
    """ Queue a job to get relations for the specified profile. """

    scrapers = {
        'instagram': worker.scrape.scrape_instagram_relations,
        'twitter': worker.scrape.scrape_twitter_relations,
    }

    description = 'Getting friends & followers for "{}" on {}' \
                  .format(profile.username, profile.site_name())
    type_ = 'relations'

    job = _scrape_queue.enqueue_call(
        func=scrapers[profile.site],
        args=[profile.id],
        timeout=_redis_worker['relations_timeout']
    )

    worker.init_job(job=job, description=description, profile_id=profile.id,
                    type_=type_)
def schedule_profiles(profiles, stub=False):
    """
    Queue jobs to fetch a list of profiles from a social media site.

    Profile scraping jobs are chunked according to maximum API request size.

    Twitter: supports 100 users per lookup:
    https://dev.twitter.com/rest/reference/get/users/lookup

    Instagram: supports 1 user per lookup:
    https://instagram.com/developer/endpoints/users/#get_users_search

    Parameters:

    'profiles' (list) - A list of profile dictionaries. Each dictionary
    specifies a profile username or upstream ID and a social media site name
    ("twitter", "instagram").

    Example:

        profiles = [
            {
                'username': '******',
                'site': 'twitter',
            },
            {
                'upstream_id': '343432',
                'site': 'instagram',
            },
            ...
        ]

    'stub' (bool) - whether or not to import the profile as a stub
    """

    # Aggregate profiles by site and API request type (username or ID).
    site_profiles = {}

    for profile in profiles:
        if profile['site'] not in site_profiles:
            site_profiles[profile['site']] = {
                'username': [],
                'upstream_id': []
            }

        if 'upstream_id' in profile:
            site_profiles[profile['site']]['upstream_id'].append(profile)
        else:
            site_profiles[profile['site']]['username'].append(profile)

    # Spawn scraping jobs.
    for site, type_profiles in site_profiles.items():
        if site == 'twitter':
            chunk_size = 100
        else:
            chunk_size = 1

        # Break jobs into API request type - username or ID.
        for type_, t_profiles in type_profiles.items():
            # Chunk by API request size.
            for i in range(0, len(t_profiles), chunk_size):
                chunk = t_profiles[i:i + chunk_size]

                if type_ == 'upstream_id':
                    ids = [p['upstream_id'] for p in chunk]
                    labels = _create_labels_dict(profiles=chunk,
                                                 type_='upstream_id')
                    job = _scrape_queue.enqueue_call(
                        func=worker.scrape.scrape_profile_by_id,
                        args=(site, ids, stub, labels),
                        timeout=_redis_worker['profile_timeout'])
                else:
                    usernames = [p['username'] for p in chunk]
                    labels = _create_labels_dict(profiles=chunk,
                                                 type_='username')
                    job = _scrape_queue.enqueue_call(
                        func=worker.scrape.scrape_profile,
                        args=(site, usernames, stub, labels),
                        timeout=_redis_worker['profile_timeout'])

                description = ('Scraping bios for {} {} profiles'
                               .format(len(chunk), site))
                worker.init_job(job=job, description=description)
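# Example (illustrative sketch): the `profiles` argument may mix username-
# and upstream-ID-based lookups across sites; the function groups them per
# site and request type, then chunks each group to the API limits noted in
# the docstring (100 per Twitter lookup, 1 per Instagram lookup), queueing
# one scrape job per chunk.
#
#     schedule_profiles(profiles=[
#         {'username': 'johndoe', 'site': 'twitter'},
#         {'upstream_id': '343432', 'site': 'instagram'},
#     ], stub=True)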