Example #1
def reprocess_instagram_profiles(friends_lower_bound=50000, period_weeks=2):
    """
    This task should run periodically by default.
    - It finds Instagram profiles that are tagged "undecided" and have
      at least 'friends_lower_bound' followers.
    - It refetches web data for these profiles and tries to classify them again.
    """
    from social_discovery.models import InstagramProfile

    pipeline = pipelines.BasicClassifierPipeline()

    for profile in InstagramProfile.objects.filter(
            date_created__lt=datetime.now() - timedelta(weeks=period_weeks),
            friends_count__gte=friends_lower_bound,
            tags__regex='(^| )undecided( |$)',
            reprocess_tries_count__lt=MAX_INSTAGRAM_REFETCH_RETRY_COUNT,
    ).order_by('-reprocess_tries_count'):
        log.info('Reprocessing profile id: {}; name: {}'.format(
            profile.id, profile.username))
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': profile.id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=REPROCESS_PROFILES_QUEUE_NAME,
        )
        profile.reprocess_tries_count += 1
        profile.save()
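
The docstring says this task should run periodically; a minimal Celery beat sketch for wiring that up could look like the following (the dotted task path and the schedule are assumptions for illustration, not taken from the project):

from celery.schedules import crontab

CELERYBEAT_SCHEDULE = {
    'reprocess-instagram-profiles': {
        # hypothetical dotted path; point this at wherever the task is registered
        'task': 'social_discovery.tasks.reprocess_instagram_profiles',
        'schedule': crontab(hour=3, minute=0),
        'kwargs': {'friends_lower_bound': 50000, 'period_weeks': 2},
    },
}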
Example #2
def task_refetch_profiles_scheduled_in_10_days_later():
    """
    Refetches profiles that were scheduled for refetching according to the
    'date_to_fetch_later' field; their web data is fetched again.

    If a profile comes back with a different description or a different url,
    it is passed back into the same pipeline it was originally part of.
    :return:
    """

    # TODO: schedule this in settings to run daily

    from social_discovery.pipeline_constants import QUEUE_TO_REFETCH_PROFILES
    from social_discovery.models import InstagramProfile

    today_min = datetime.combine(date.today(), time.min)
    today_max = datetime.combine(date.today(), time.max)
    profile_ids_to_re_perform = InstagramProfile.objects.filter(
        date_to_fetch_later__range=(today_min,
                                    today_max)).values_list('id', flat=True)

    log.info('Issuing Celery tasks to refetch profiles: %s' %
             len(profile_ids_to_re_perform))

    ctr = 0
    for profile_id in profile_ids_to_re_perform:

        crawler_task.apply_async(
            kwargs={
                'profile_id': profile_id,
            },
            # TODO: overridden for convenience
            queue=QUEUE_TO_REFETCH_PROFILES)
        ctr += 1
    log.info('Issued Celery tasks to refetch profiles: %s' % ctr)
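
On Django 1.9 or newer, the today_min/today_max pair can be expressed with the __date lookup; this equivalent sketch assumes such a Django version is in use:

profile_ids_to_re_perform = InstagramProfile.objects.filter(
    date_to_fetch_later__date=date.today()).values_list('id', flat=True)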
Example #3
def detect_social_urls_for_have_youtube(qty=1000):
    """
    Runs the URL-discovery pipeline (via the Celery queue) for all profiles
    tagged 'have_youtube'.

    :return:
    """

    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import get_queue_name_by_pipeline_step

    initial_profiles = InstagramProfile.objects.filter(
        tags__contains="have_youtube",
        # friends_count__gte=5000
    ).exclude(tags__contains='mom').filter(
        tags__contains='blogger').order_by('id').values_list("id", flat=True)

    if qty is not None:
        initial_profiles = initial_profiles[:qty]

    log.info('Initial profiles found: %s' % initial_profiles.count())

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverUrlsPipeline()

    for ip_id in list(initial_profiles):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(
                pipeline.PIPELINE_ROUTE[0])  # PIPELINE_QUEUE_NAME
        )
Example #4
def detect_social_urls_for_profiles(must_have_tags='have_youtube',
                                    exclude_tags=None,
                                    friends_threshold=1000,
                                    qty=1000):
    """
    Runs the URL-discovery pipeline (via the Celery queue) for profiles that
    match the given tag and follower-count filters.

    :return:
    """

    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import get_queue_name_by_pipeline_step

    initial_profiles = InstagramProfile.objects.filter(
        tags__contains=must_have_tags,
        friends_count__gte=friends_threshold,
    ).order_by('id')

    if exclude_tags:
        initial_profiles = initial_profiles.exclude(
            tags__contains=exclude_tags)

    blogs = initial_profiles.filter(tags__contains='blogger')
    undecided = initial_profiles.filter(tags__contains='undecided')

    final_profiles = blogs  #| undecided

    final_profiles = final_profiles.values_list("id", flat=True)

    if qty is not None:
        final_profiles = final_profiles[:qty]

    log.info('Profiles found: %s' % final_profiles.count())

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverUrlsPipeline()

    for ip_id in list(final_profiles):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(
                pipeline.PIPELINE_ROUTE[0])  # PIPELINE_QUEUE_NAME
        )
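
For illustration, a call that limits discovery to larger profiles tagged 'have_youtube' while skipping 'mom' profiles might look like this (the parameter values are assumptions):

detect_social_urls_for_profiles(must_have_tags='have_youtube',
                                exclude_tags='mom',
                                friends_threshold=5000,
                                qty=500)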
Example #5
    def create_new_profiles(self,
                            hashtags=None,
                            submission_tracker=None,
                            num_pages_to_load=20,
                            pipeline_class=None,
                            **kwargs):
        """
        Iterates over the given hashtags, loading each feed at
        https://instagram.com/explore/tags/<hashtag>/, and issues a task per tag.

        Note: 'hashtags' should be a dict with categories and tags like:
        {'singapore': ['oo7d', 'anothertag', 'onemoretag', ...], ...}

        """
        if not isinstance(hashtags, dict):
            log.error(
                'hashtags parameter should be a dict of categories and lists '
                'of their corresponding hashtags, not a %s' % type(hashtags))
            return None

        log.info('Issuing tasks to obtain profiles for hashtags: %s' %
                 hashtags)
        # print('hashtags: %s   num_pages: %s' % (hashtags, num_pages_to_load))

        with OpRecorder('instagram_crawl_scrape_instagram_feeds'):
            for cat, tags in hashtags.items():
                for tag in tags:
                    crawler_task.apply_async(
                        kwargs={
                            'klass_name': 'CreatorByInstagramHashtags',
                            'task_type': 'perform_feed',
                            'tag': tag,
                            'num_pages': num_pages_to_load,
                            'category': cat,
                            'pipeline_class': pipeline_class
                        },
                        # Queue where tasks to perform separate feeds are put
                        queue='instagram_feed_scraper',
                    )

                    if submission_tracker is not None:
                        submission_tracker.count_task(
                            'crawlers.scrape_instagram_feed_for_tag')
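
A minimal call sketch, assuming create_new_profiles lives on the CreatorByInstagramHashtags class referenced in the kwargs above; the categories and hashtags are made up for illustration:

creator = CreatorByInstagramHashtags()
creator.create_new_profiles(
    hashtags={
        'singapore': ['sgfood', 'sgtravel'],
        'fitness': ['homeworkout'],
    },
    num_pages_to_load=10,
)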
Example #6
    def pipeline(self, profile_id=None, route=None, **kwargs):
        """
        Processes a single profile and decides whether it continues along the
        pipeline's route.
        """

        log.info('Started %s.pipeline(profile_id=%s, route=%s)' %
                 (type(self).__name__, profile_id, route))
        # Fetching data from kwargs
        try:
            profile = InstagramProfile.objects.get(id=profile_id)
            category = self.classify_unit(profile)

            profile.append_mutual_exclusive_tag(category,
                                                self.AVAILABLE_CATEGORIES)

            # creating a SocialProfileOp object for this event
            SocialProfileOp.objects.create(
                profile_id=profile.id,
                description=category,
                module_classname=type(self).__name__,
                data={})

            log.info('category=%s' % category)

            # proceeding with pipeline route if result is suitable
            if isinstance(route, list) and len(route) > 1 and self.proceed(
                    result=category):
                log.info('Proceeding to the next step: %s' % route[1])
                crawler_task.apply_async(
                    kwargs={
                        'klass_name': route[1],
                        'task_type': 'pipeline',
                        'profile_id': profile.id,
                        'route': route[1:],
                    },
                    queue=get_queue_name_by_pipeline_step(route[1]),
                )
            else:
                log.info(
                    'Route finished or terminating route because of result.')

        except InstagramProfile.DoesNotExist:
            log.error('InstagramProfile with id: %s does not exist, exiting.' %
                      profile_id)
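
How the route advances can be read off the kwargs above: each step sees itself at index 0 and re-enqueues route[1:], so the chain stops once only one element remains. A sketch with hypothetical step names:

route = ['ClassifierA', 'ClassifierB', 'UpgraderC']  # hypothetical step names
# first dispatch: klass_name='ClassifierA', route=['ClassifierA', 'ClassifierB', 'UpgraderC']
# after step A:   klass_name='ClassifierB', route=['ClassifierB', 'UpgraderC']
# after step B:   klass_name='UpgraderC',   route=['UpgraderC']  -> len(route) == 1, chain ends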
Example #7
def task_discover_existing_platforms(must_have_tags='have_youtube',
                                     exclude_tags=None,
                                     friends_threshold=1000,
                                     qs=None):
    """
    Runs the HaveYoutubeDiscoverPlatformsPipeline for the given profile ids
    or, when 'qs' is not supplied, for 'blogger' profiles matching the tag
    and follower-count filters.
    """
    if qs is None:
        from social_discovery.models import InstagramProfile
        initial_profiles = InstagramProfile.objects.filter(
            tags__contains=must_have_tags,
            friends_count__gte=friends_threshold,
        ).order_by('id')
        if exclude_tags:
            initial_profiles = initial_profiles.exclude(
                tags__contains=exclude_tags)

        blogs = initial_profiles.filter(tags__contains='blogger')
        undecided = initial_profiles.filter(tags__contains='undecided')

        #final_profiles = blogs | undecided
        final_profiles = blogs

        qs = final_profiles.values_list("id", flat=True)

    # issuing tasks
    pipeline = pipelines.HaveYoutubeDiscoverPlatformsPipeline()

    for ip_id in list(qs):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            # TODO: overridden for convenience
            queue='profiles_pipeline_upgraders_youtube'  # PIPELINE_QUEUE_NAME
        )
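
Because the non-default branch iterates 'qs' directly as profile ids, a caller supplying its own queryset should pass a flat values_list of ids; an illustrative call (the filter values are assumptions):

from social_discovery.models import InstagramProfile

task_discover_existing_platforms(
    qs=InstagramProfile.objects.filter(
        tags__contains='blogger',
        friends_count__gte=5000,
    ).values_list('id', flat=True))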
Example #8
    def run_pipeline(self, data=None):
        """
        Dispatches the pipeline: issues a crawler_task for each profile derived
        from 'data' (a single id, a list of ids, or a QuerySet).
        """
        if not self.PIPELINE_ROUTE or not isinstance(self.PIPELINE_ROUTE, (
                list,
                tuple,
        )):
            log.error(
                ('Pipeline route is empty or incorrectly given: {}, exiting.'
                 ).format(self.PIPELINE_ROUTE))
            return

        if isinstance(data, (int, str)):
            queryset = InstagramProfile.objects.filter(id=data)
        elif isinstance(data, list):
            queryset = InstagramProfile.objects.filter(id__in=data)
        elif isinstance(data, QuerySet):
            queryset = data
        else:
            # TODO: Maybe fetch all profiles for the last day?
            queryset = InstagramProfile.objects.filter(
                friends_count__gte=self.DEFAULT_MINIMUM_FRIENDS_COUNT)

        profiles = queryset_iterator(queryset)

        log.info('Processing %s profiles...' % queryset.count())

        for profile in profiles:
            crawler_task.apply_async(
                kwargs={
                    'klass_name': self.PIPELINE_ROUTE[0],
                    'task_type': 'pipeline',
                    'profile_id': profile.id,
                    'route': self.PIPELINE_ROUTE,
                },
                queue=get_queue_name_by_pipeline_step(self.PIPELINE_ROUTE[0]),
            )
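
run_pipeline accepts a single id, a list of ids, or a QuerySet; a usage sketch, assuming BasicClassifierPipeline (seen in Example #1) inherits this method and that the ids are illustrative:

pipeline = pipelines.BasicClassifierPipeline()
pipeline.run_pipeline(data=12345)                  # a single profile id
pipeline.run_pipeline(data=[12345, 67890])         # a list of ids
pipeline.run_pipeline(
    data=InstagramProfile.objects.filter(tags__contains='undecided'))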
Example #9
def task_connect_instagramprofile_to_influencers(must_have_tags=None,
                                                 exclude_tags=None,
                                                 friends_threshold=1000,
                                                 limit=1000,
                                                 qs=None):
    """
    This task should run periodically by default.
    - It finds Instagram profiles that are not yet connected to influencers.
    - It then filters by friends_count and the 'blogger' tag.
    - If fewer than 'limit' blogger profiles are available, 'undecided'
      profiles are also included.
    """

    from social_discovery.models import InstagramProfile
    from social_discovery.pipeline_constants import CONNECT_PROFILES_QUEUE_NAME

    if not qs:
        initial_profiles = InstagramProfile.objects.filter(
            friends_count__gte=friends_threshold)

        if must_have_tags:
            initial_profiles = initial_profiles.filter(
                tags__contains=must_have_tags)

        if exclude_tags:
            initial_profiles = initial_profiles.exclude(
                tags__contains=exclude_tags)

        # we don't want to process profiles that already have a connected
        # influencer
        initial_profiles = initial_profiles.filter(
            discovered_influencer__isnull=True)

        blogs = initial_profiles.filter(tags__contains='blogger')

        # use undecided only if the blog profiles are not enough
        if blogs.count() < limit:
            undecided = initial_profiles.filter(
                tags__contains='undecided').filter(
                    tags__contains='SHORT_BIO_50')
            final_profiles = blogs | undecided
        else:
            final_profiles = blogs

        qs = final_profiles.values_list(
            'id', flat=True).order_by('-friends_count')[:limit]
    else:
        qs = qs.values_list('id', flat=True).order_by('id')

    # issuing tasks
    pipeline = pipelines.ConnectInstagramProfilesToInfluencersPipeline()

    for ip_id in list(qs):
        crawler_task.apply_async(
            kwargs={
                'klass_name': pipeline.PIPELINE_ROUTE[0],
                'task_type': 'pipeline',
                'profile_id': ip_id,
                'route': pipeline.PIPELINE_ROUTE,
            },
            # TODO: overridden for convenience
            queue=CONNECT_PROFILES_QUEUE_NAME)
Example #10
    def perform_feed(self,
                     tag,
                     num_pages,
                     category,
                     pipeline_class=None,
                     **kwargs):
        """
        Scrapes the Instagram tag page for a given tag; tags come from
        blog_discovery.hashtags[category] (a list of tags per category).
        """
        with OpRecorder('instagram_crawl_feed_for_tag'):
            from xpathscraper import xbrowser
            from django.conf import settings
            page_count = 0
            image_urls = set()
            old_image_urls_count = 0
            log.info("Starting scraping for tag %r" % tag)
            with xbrowser.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    load_no_images=True) as xb:
                url = 'https://instagram.com/explore/tags/%s/' % tag
                xb.load_url(url)
                time.sleep(2)

                # checking the number of posts if it is already in cache
                posts_qty = None
                posts_qty_nodes = xb.driver.find_elements_by_xpath(
                    '//header/span/span[@class]')
                if len(posts_qty_nodes) > 0:
                    try:
                        posts_qty = posts_qty_nodes[0].text
                        posts_qty = int(posts_qty.strip().replace(',', ''))
                        cached_posts_qty = cache.get('instagram_tag__%s' % tag)
                        if cached_posts_qty is not None and (
                                posts_qty - int(cached_posts_qty)) <= 100:
                            log.info(
                                'Cached posts quantity is %s, now it is %s, '
                                'too few new posts - skipping this feed.' %
                                (cached_posts_qty, posts_qty))
                            return
                        else:
                            log.info(
                                'Cached posts quantity is %s, now it is %s, performing this feed.'
                                % (cached_posts_qty, posts_qty))
                    except ValueError:
                        log.error(
                            'Could not parse posts quantity to number: %s, please check format'
                            % posts_qty)
                else:
                    log.info(
                        'No posts quantity node detected, possible Instagram page HTML structure changed.'
                    )

                # scroll to the bottom before we can find the 'load more pages' button
                xb.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                while page_count < num_pages:
                    # find all images on the page so far and add them to our set
                    try:
                        # images = xb.driver.find_elements_by_xpath('//div[contains(@class, "PostsGrid__root")]//a')
                        # Instagram structure changed
                        images = xb.driver.find_elements_by_xpath(
                            '//article//a')
                    except Exception:
                        # lookup failed; stop paging through this feed
                        page_count = num_pages
                        continue
                    all_image_urls = set()
                    for i in images:
                        all_image_urls.add(i.get_attribute('href'))

                    new_image_urls = all_image_urls - image_urls
                    image_urls = all_image_urls
                    if len(image_urls) == old_image_urls_count:
                        page_count = num_pages
                        continue
                    old_image_urls_count = len(image_urls)

                    print(
                        "new images: %d so far we have %d image urls for tag %r"
                        % (len(new_image_urls), len(image_urls), tag))
                    for i in new_image_urls:
                        try:
                            crawler_task.apply_async(
                                kwargs={
                                    'klass_name': 'CreatorByInstagramHashtags',
                                    'task_type': 'create_profile',
                                    'url': i,
                                    'tag': tag,
                                    'category': category,
                                    'pipeline_class': pipeline_class
                                },
                                # Queue where tasks to create new profiles for separate posts in feed are put
                                queue='scrape_instagram_posts_new',
                            )
                        except Exception:
                            print("error issuing task for %s" % i)
                    # find the next page button
                    # el = xb.driver.find_elements_by_xpath('//div[contains(@class, "moreLoadingIndicator")]//a')
                    el = xb.driver.find_elements_by_xpath(
                        '//a[contains(text(), "Load more")]')

                    if page_count == 0 and len(el) > 0:
                        e = el[0]
                        e.click()
                        log.info(
                            "Found next page button for page %s successfully, clicking and waiting."
                            % page_count)

                    else:
                        log.info(
                            "'Load more' button not found, scrolling to "
                            "trigger further loading.")
                        #page_count = num_pages
                        # scroll to the bottom before we can find the 'load more pages' button
                        xb.driver.execute_script("window.scrollTo(0, 50);")
                        xb.driver.execute_script(
                            "window.scrollTo(0, 1000000);")
                    time.sleep(3)
                    page_count += 1

                # caching post quantity for this tag
                if tag is not None and isinstance(posts_qty, int):
                    cache.set('instagram_tag__%s' % tag, posts_qty)
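
cache.set() without a timeout falls back to the cache backend's default TTL; if the post count should survive longer between runs, an explicit timeout can be passed (the 7-day value is an assumption):

cache.set('instagram_tag__%s' % tag, posts_qty, timeout=7 * 24 * 3600)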