def reprocess_instagram_profiles(friends_lower_bound=50000, period_weeks=2):
    """
    This task should run periodically by default.
     - It finds instagram profiles that are marked as "undecided" and have
       more than 'friends_lower_bound' followers.
     - It refetches web data for these profiles and tries to classify them
       again.
    """
    from social_discovery.models import InstagramProfile

    pipeline = pipelines.BasicClassifierPipeline()

    for profile in InstagramProfile.objects.filter(
        date_created__lt=datetime.now() - timedelta(weeks=period_weeks),
        friends_count__gte=friends_lower_bound,
        tags__regex='(^| )undecided( |$)',
        reprocess_tries_count__lt=MAX_INSTAGRAM_REFETCH_RETRY_COUNT,
    ).order_by('-reprocess_tries_count'):
        log.info('Reprocessing profile id: {}; name: {}'.format(
            profile.id, profile.username))

        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': profile.id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=REPROCESS_PROFILES_QUEUE_NAME)

        profile.reprocess_tries_count += 1
        profile.save()

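# Scheduling sketch (an assumption, not taken from this module): since the
# task is meant to run periodically, it would typically be wired into the
# Celery beat schedule in settings. The task path and schedule values below
# are illustrative only.
#
# from celery.schedules import crontab
#
# CELERYBEAT_SCHEDULE['reprocess-instagram-profiles'] = {
#     'task': 'social_discovery.tasks.reprocess_instagram_profiles',
#     'schedule': crontab(hour=3, minute=0),
#     'kwargs': {'friends_lower_bound': 50000, 'period_weeks': 2},
# }
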
def task_refetch_profiles_scheduled_in_10_days_later():
    """
    This task refetches profiles that were scheduled to be refetched
    according to the 'date_to_fetch_later' field. Their data is refetched,
    and if the profile ends up with a different description or url, it is
    passed to the same pipeline it was originally a part of.
    :return:
    """
    # TODO: start this in settings on a daily basis
    from social_discovery.pipeline_constants import QUEUE_TO_REFETCH_PROFILES
    from social_discovery.models import InstagramProfile

    today_min = datetime.combine(date.today(), time.min)
    today_max = datetime.combine(date.today(), time.max)

    profile_ids_to_re_perform = InstagramProfile.objects.filter(
        date_to_fetch_later__range=(today_min, today_max)
    ).values_list('id', flat=True)

    log.info('Issuing Celery tasks to refetch profiles: %s' % len(
        profile_ids_to_re_perform))

    ctr = 0
    for profile_id in profile_ids_to_re_perform:
        crawler_task.apply_async(
            kwargs={
                'profile_id': profile_id,
            },
            # TODO: overridden for comfort
            queue=QUEUE_TO_REFETCH_PROFILES)
        ctr += 1

    log.info('Issued Celery tasks to refetch profiles: %s' % ctr)

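# How a profile ends up in today's window (a sketch, assuming an earlier
# pipeline step is responsible for scheduling the refetch roughly 10 days
# ahead, which is what the task name refers to):
#
# profile.date_to_fetch_later = datetime.now() + timedelta(days=10)
# profile.save()
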
def detect_social_urls_for_have_youtube(qty=1000):
    """
    Runs the url-discovery pipeline (with a celery queue) for profiles
    tagged 'have_youtube' and 'blogger'.
    :return:
    """
    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import \
        get_queue_name_by_pipeline_step

    initial_profiles = InstagramProfile.objects.filter(
        tags__contains="have_youtube",
        # friends_count__gte=5000
    ).exclude(
        tags__contains='mom'
    ).filter(
        tags__contains='blogger'
    ).order_by('id').values_list("id", flat=True)

    if qty is not None:
        initial_profiles = initial_profiles[:qty]

    log.info('Initial profiles found: %s' % initial_profiles.count())

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverUrlsPipeline()
    for ip_id in list(initial_profiles):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(
                pipeline.PIPELINE_ROUTE[0])  # PIPELINE_QUEUE_NAME
        )

def detect_social_urls_for_profiles(must_have_tags='have_youtube',
                                    exclude_tags=None,
                                    friends_threshold=1000, qty=1000):
    """
    Runs the url-discovery pipeline (with a celery queue) for blogger
    profiles that match 'must_have_tags' and the friends threshold.
    :return:
    """
    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import \
        get_queue_name_by_pipeline_step

    initial_profiles = InstagramProfile.objects.filter(
        tags__contains=must_have_tags,
        friends_count__gte=friends_threshold,
    ).order_by('id')

    if exclude_tags:
        initial_profiles = initial_profiles.exclude(
            tags__contains=exclude_tags)

    blogs = initial_profiles.filter(tags__contains='blogger')
    undecided = initial_profiles.filter(tags__contains='undecided')

    final_profiles = blogs  # | undecided
    final_profiles = final_profiles.values_list("id", flat=True)

    if qty is not None:
        final_profiles = final_profiles[:qty]

    log.info('Initial profiles found: %s' % final_profiles.count())

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverUrlsPipeline()
    for ip_id in list(final_profiles):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(
                pipeline.PIPELINE_ROUTE[0])  # PIPELINE_QUEUE_NAME
        )

def create_new_profiles(self, hashtags=None, submission_tracker=None,
                        num_pages_to_load=20, pipeline_class=None, **kwargs):
    """
    Iterates over a list of hashtags by the mask
    https://instagram.com/explore/tags/<hashtag>/ and issues a task to
    perform each feed.

    Note: 'hashtags' should be a dict with categories and tags like:
        {'singapore': ['oo7d', 'anothertag', 'onemoretag', ...], ...}
    """
    if not isinstance(hashtags, dict):
        log.error(
            'hashtags parameter should be a dict of categories and lists '
            'of their corresponding hashtags, not a %s' % type(hashtags))
        return None

    log.info('Issuing tasks to obtain profiles for hashtags: %s' % hashtags)
    # print('hashtags: %s num_pages: %s' % (hashtags, num_pages_to_load))

    with OpRecorder('instagram_crawl_scrape_instagram_feeds'):
        for cat, tags in hashtags.items():
            for tag in tags:
                crawler_task.apply_async(
                    kwargs={
                        'klass_name': 'CreatorByInstagramHashtags',
                        'task_type': 'perform_feed',
                        'tag': tag,
                        'num_pages': num_pages_to_load,
                        'category': cat,
                        'pipeline_class': pipeline_class
                    },
                    # Queue where tasks to perform separate feeds are put
                    queue='instagram_feed_scraper',
                )
                if submission_tracker is not None:
                    submission_tracker.count_task(
                        'crawlers.scrape_instagram_feed_for_tag')

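# Usage sketch (the calling class and pipeline value are assumptions; only
# the dict shape is documented above): each category maps to the list of
# tags whose feeds should be scraped.
#
# creator = CreatorByInstagramHashtags()
# creator.create_new_profiles(
#     hashtags={'singapore': ['oo7d', 'anothertag']},
#     num_pages_to_load=20,
#     pipeline_class='BasicClassifierPipeline',  # illustrative value
# )
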
def pipeline(self, profile_id=None, route=None, **kwargs):
    """
    Performs a single profile and decides whether it goes further along the
    pipeline's route.
    """
    log.info('Started %s.pipeline(profile_id=%s, route=%s)' % (
        type(self).__name__, profile_id, route))

    # Fetching data from kwargs
    try:
        profile = InstagramProfile.objects.get(id=profile_id)
        category = self.classify_unit(profile)
        profile.append_mutual_exclusive_tag(
            category, self.AVAILABLE_CATEGORIES)

        # creating a SocialProfileOp object for this event
        SocialProfileOp.objects.create(
            profile_id=profile.id,
            description=category,
            module_classname=type(self).__name__,
            data={})

        log.info('category=%s' % category)

        # proceeding with the pipeline route if the result is suitable
        if isinstance(route, list) and len(route) > 1 and self.proceed(
                result=category):
            log.info('Proceeding to the next step: %s' % route[1])
            crawler_task.apply_async(
                kwargs={
                    'klass_name': route[1],
                    'task_type': 'pipeline',
                    'profile_id': profile.id,
                    'route': route[1:],
                },
                queue=get_queue_name_by_pipeline_step(route[1]))
        else:
            log.info('Route finished or terminating route because of result.')

    except InstagramProfile.DoesNotExist:
        log.error('InstagramProfile with id: %s does not exist, exiting.'
                  % profile_id)

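# Route-advancing note (step names are illustrative): with a route such as
# ['ClassifierStep', 'UpgraderStep', 'ConnectorStep'], a task running
# 'ClassifierStep' that decides to proceed re-queues the profile with
# route=['UpgraderStep', 'ConnectorStep'], so every step sees itself at
# route[0] and its successor at route[1] until the route is exhausted.
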
def task_discover_existing_platforms(must_have_tags='have_youtube',
                                     exclude_tags=None,
                                     friends_threshold=1000, qs=None):
    """
    Runs the platform-discovery pipeline for blogger profiles matching the
    given tags and friends threshold, or for an explicitly supplied queryset.
    """
    if qs is None:
        from social_discovery.models import InstagramProfile

        initial_profiles = InstagramProfile.objects.filter(
            tags__contains=must_have_tags,
            friends_count__gte=friends_threshold,
        ).order_by('id')

        if exclude_tags:
            initial_profiles = initial_profiles.exclude(
                tags__contains=exclude_tags)

        blogs = initial_profiles.filter(tags__contains='blogger')
        undecided = initial_profiles.filter(tags__contains='undecided')

        # final_profiles = blogs | undecided
        final_profiles = blogs

        qs = final_profiles.values_list("id", flat=True)

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverPlatformsPipeline()
    for ip_id in list(qs):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            # TODO: overridden for comfort
            queue='profiles_pipeline_upgraders_youtube'  # PIPELINE_QUEUE_NAME
        )

def run_pipeline(self, data=None):
    """
    Dispatches every profile selected by 'data' to the first step of the
    pipeline route.
    """
    if not self.PIPELINE_ROUTE or not isinstance(
            self.PIPELINE_ROUTE, (list, tuple)):
        log.error(
            'Pipeline route is empty or incorrectly given: {}, '
            'exiting.'.format(self.PIPELINE_ROUTE))
        return

    if isinstance(data, (int, str)):
        queryset = InstagramProfile.objects.filter(id=data)
    elif isinstance(data, list):
        queryset = InstagramProfile.objects.filter(id__in=data)
    elif isinstance(data, QuerySet):
        queryset = data
    else:
        # TODO: Maybe fetch all profiles for the last day?
        queryset = InstagramProfile.objects.filter(
            friends_count__gte=self.DEFAULT_MINIMUM_FRIENDS_COUNT)

    profiles = queryset_iterator(queryset)

    log.info('Performing %s profiles...' % queryset.count())
    for profile in profiles:
        crawler_task.apply_async(
            kwargs={
                'klass_name': self.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': profile.id,
                'route': self.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(self.PIPELINE_ROUTE[0]))

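# Usage sketch (the concrete pipeline class is an assumption; any pipeline
# exposing PIPELINE_ROUTE behaves the same): 'data' may be a single id, a
# list of ids, a QuerySet, or omitted to fall back to the
# DEFAULT_MINIMUM_FRIENDS_COUNT filter.
#
# pipeline = pipelines.BasicClassifierPipeline()
# pipeline.run_pipeline(data=12345)            # single profile id
# pipeline.run_pipeline(data=[12345, 67890])   # list of ids
# pipeline.run_pipeline(
#     data=InstagramProfile.objects.filter(tags__contains='blogger'))
# pipeline.run_pipeline()                      # default queryset
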
def task_connect_instagramprofile_to_influencers(must_have_tags=None,
                                                 exclude_tags=None,
                                                 friends_threshold=1000,
                                                 limit=1000, qs=None):
    """
    This task should run periodically by default.
     - It finds instagram profiles that are not already connected to
       influencers.
     - Then it filters by friends_count and the 'blogger' tag.
     - If fewer than 'limit' blogger profiles are available, 'undecided'
       profiles (with the SHORT_BIO_50 tag) are included as well.
    """
    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import \
        CONNECT_PROFILES_QUEUE_NAME

    if not qs:
        initial_profiles = InstagramProfile.objects.filter(
            friends_count__gte=friends_threshold)

        if must_have_tags:
            initial_profiles = initial_profiles.filter(
                tags__contains=must_have_tags)
        if exclude_tags:
            initial_profiles = initial_profiles.exclude(
                tags__contains=exclude_tags)

        # we don't want to process profiles that already have a connected
        # influencer
        initial_profiles = initial_profiles.filter(
            discovered_influencer__isnull=True)

        blogs = initial_profiles.filter(tags__contains='blogger')

        # use undecided only if the blog profiles are not enough
        if blogs.count() < limit:
            undecided = initial_profiles.filter(
                tags__contains='undecided'
            ).filter(tags__contains='SHORT_BIO_50')
            final_profiles = blogs | undecided
        else:
            final_profiles = blogs

        qs = final_profiles.values_list(
            'id', flat=True).order_by('-friends_count')[:limit]
    else:
        qs = qs.values_list('id', flat=True).order_by('id')

    # issuing tasks
    pipeline = pipelines.ConnectInstagramProfilesToInfluencersPipeline()
    for ip_id in list(qs):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            # TODO: overridden for comfort
            queue=CONNECT_PROFILES_QUEUE_NAME)

def perform_feed(self, tag, num_pages, category, pipeline_class=None,
                 **kwargs):
    """
    Scrapes the Instagram tag page for a given tag, where
    blog_discovery.hashtags[category] is the list of tags.
    """
    with OpRecorder('instagram_crawl_feed_for_tag'):
        from xpathscraper import xbrowser
        from django.conf import settings

        page_count = 0
        image_urls = set()
        old_image_urls_count = 0

        log.info("Starting scraping for tag %r" % tag)
        with xbrowser.XBrowser(
                headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                load_no_images=True) as xb:
            url = 'https://instagram.com/explore/tags/%s/' % tag
            xb.load_url(url)
            time.sleep(2)

            # checking the number of posts if it is already in cache
            posts_qty = None
            posts_qty_nodes = xb.driver.find_elements_by_xpath(
                '//header/span/span[@class]')
            if len(posts_qty_nodes) > 0:
                try:
                    posts_qty = posts_qty_nodes[0].text
                    posts_qty = int(posts_qty.strip().replace(',', ''))
                    cached_posts_qty = cache.get('instagram_tag__%s' % tag)
                    if cached_posts_qty is not None and (
                            posts_qty - int(cached_posts_qty)) <= 100:
                        log.info(
                            'Cached posts quantity is %s, now it is %s, '
                            'too few new posts - skipping this feed.' % (
                                cached_posts_qty, posts_qty))
                        return
                    else:
                        log.info(
                            'Cached posts quantity is %s, now it is %s, '
                            'performing this feed.' % (
                                cached_posts_qty, posts_qty))
                except ValueError:
                    log.error(
                        'Could not parse posts quantity to number: %s, '
                        'please check format' % posts_qty)
            else:
                log.info(
                    'No posts quantity node detected, possible Instagram '
                    'page HTML structure changed.')

            # scroll to the bottom before we can find the
            # 'load more pages' button
            xb.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            while page_count < num_pages:
                # find all images on the page so far and add them to our set
                try:
                    # Instagram structure changed, the old selector was:
                    # '//div[contains(@class, "PostsGrid__root")]//a'
                    images = xb.driver.find_elements_by_xpath('//article//a')
                except Exception:
                    page_count = num_pages
                    continue

                all_image_urls = set()
                for i in images:
                    all_image_urls.add(i.get_attribute('href'))

                new_image_urls = all_image_urls - image_urls
                image_urls = all_image_urls

                if len(image_urls) == old_image_urls_count:
                    page_count = num_pages
                    continue

                old_image_urls_count = len(image_urls)
                print("new images: %d so far we have %d image urls for tag %r" % (
                    len(new_image_urls), len(image_urls), tag))

                for i in new_image_urls:
                    try:
                        crawler_task.apply_async(
                            kwargs={
                                'klass_name': 'CreatorByInstagramHashtags',
                                'task_type': 'create_profile',
                                'url': i,
                                'tag': tag,
                                'category': category,
                                'pipeline_class': pipeline_class
                            },
                            # Queue where tasks to create new profiles for
                            # separate posts in feed are put
                            queue='scrape_instagram_posts_new',
                        )
                    except Exception:
                        print("some error for %s" % i)

                # find the next page button
                # el = xb.driver.find_elements_by_xpath(
                #     '//div[contains(@class, "moreLoadingIndicator")]//a')
                el = xb.driver.find_elements_by_xpath(
                    '//a[contains(text(), "Load more")]')
                if page_count == 0 and len(el) > 0:
                    e = el[0]
                    e.click()
                    log.info(
                        "Found next page button for page %s successfully, "
                        "clicking and waiting." % page_count)
                else:
                    log.info("'Load More Pics' button not found... returning.")
                    # page_count = num_pages

                # scroll to the bottom before we can find the
                # 'load more pages' button
                xb.driver.execute_script("window.scrollTo(0, 50);")
                xb.driver.execute_script("window.scrollTo(0, 1000000);")
                time.sleep(3)
                page_count += 1

        # caching post quantity for this tag
        if tag is not None and isinstance(posts_qty, int):
            cache.set('instagram_tag__%s' % tag, posts_qty)
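
# Cache-gate note: a feed is skipped when the tag's post count grew by 100
# or fewer since the last successful run, tracked under the per-tag key
# 'instagram_tag__<tag>'. Clearing that key forces a full re-scrape, e.g.
# (illustrative):
#
# cache.delete('instagram_tag__%s' % tag)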