def _copy_all_to_production(model_cls, model_name):
    """Copy every row of ``model_cls`` from the 'default' DB to 'production'.

    Prints progress every 1000 rows. Returns the number of objects copied.

    :param model_cls: Django model class whose rows are copied wholesale
    :param model_name: human-readable name used in progress messages
    """
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying %s...' % (stamp(), model_name))
    ctr = 0
    for obj in queryset_iterator(model_cls.objects.using('default').all()):
        # save(using=...) inserts the row (same pk) into the production DB
        obj.save(using='production')
        ctr += 1
        if ctr % 1000 == 0:
            print('%s Saved %s %s models' % (stamp(), ctr, model_name))
    print('%s Copied %s %s objects.' % (stamp(), ctr, model_name))
    return ctr


def copy_part_1():
    """ Part I of copying data:

    django.contrib.auth.models.User (Django's users table)
    debra.DemographicsLocality (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.BrandCategory (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.Category (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.Brands (Has ManyToMany: debra.BrandCategory, debra.Brands is a "reference" ? Should be copied with ALL its data?)
    debra.UserProfile (Has OneToOne to: django.contrib.auth.models.User, debra.Brands, FK to: debra.Influencer)
    debra.Influencer (Has FK to: django.contrib.auth.models.User, debra.DemographicsLocality, debra.UserProfile)

    :return:
    """
    # The original body repeated the same copy loop three times verbatim;
    # factored into _copy_all_to_production (behavior unchanged).
    _copy_all_to_production(DemographicsLocality, 'DemographicsLocality')
    _copy_all_to_production(BrandCategory, 'BrandCategory')
    _copy_all_to_production(Category, 'Category')
def classify_queryset(self, queryset=None, category=None, to_tag=True, **kwargs):
    """ Helper method. Source_queryset should be a queryset for InstagramProfiles.
    Same as above but performs the whole queryset. Could return a dict of pairs
    ' id: classification_value ' or a queryset object with excluding by ids.
    Example: We want to filter queryset so only bloggers should remain: we call
    the function as cs.classify_queryset(source_queryset=qs, category='blogger')
    Method's drawback: could be extremely time consuming queryset. Could be
    changed by filtering out ids of objects.
    :param to_tag if set True, then category tag will be set to these profiles
    """
    # Unknown category: nothing to classify, hand the queryset back untouched.
    if category not in self.AVAILABLE_CATEGORIES:
        return queryset
    profiles = queryset_iterator(queryset)
    ids = set()
    for profile in profiles:
        biography = profile.api_data.get('biography')
        # Only profiles with a biography are considered for classification.
        if biography is not None and self.classify_unit(
                profile) == category:
            ids.add(profile.id)
            # setting tag for classified profiles
            if to_tag:
                # profile.append_mutual_exclusive_tag(category, self.AVAILABLE_CATEGORIES)
                # NOTE(review): category passed the AVAILABLE_CATEGORIES guard
                # above, so (unless None is a member of AVAILABLE_CATEGORIES)
                # this condition is always True and the elif below is dead code
                # -- confirm before relying on the tag-stripping branch.
                if category is not None:
                    profile.append_mutual_exclusive_tag(
                        category, self.AVAILABLE_CATEGORIES)
                elif profile.tags is not None and any(
                        [t in profile.tags for t in self.AVAILABLE_CATEGORIES]):
                    # Strip every known category token from the tag string.
                    profile.tags = ' '.join([
                        t for t in profile.tags.split()
                        if t not in self.AVAILABLE_CATEGORIES
                    ])
                    # NOTE(review): save() placement reconstructed from
                    # collapsed source -- presumably it persists the tag
                    # rewrite above; confirm it should not run on the
                    # append_mutual_exclusive_tag path too.
                    profile.save()
            # creating a SocialProfileOp object for this event
            SocialProfileOp.objects.create(
                profile_id=profile.id,
                description=category,
                module_classname=type(self).__name__,
                data={})
    # Narrow the original queryset to just the profiles that matched.
    return queryset.filter(id__in=ids)
def _get_required_number_of_posts_for_each_platform(
        self, per_influencer=True):
    """Return a mapping of platform name -> required number of posts.

    When ``per_influencer`` is True the totals are summed from the
    deliverables of every distinct contract participating in the campaign;
    otherwise the campaign-level deliverables are multiplied by the
    influencer count.
    """
    if not per_influencer:
        campaign_deliverables = self.campaign.deliverables_json
        return {
            platform: campaign_deliverables.get(
                platform.lower(), {}).get('value', 0) * self.influencers_count
            for platform in PLATFORMS
        }

    totals = defaultdict(int)
    # Distinct contracts referenced by the campaign's post analytics.
    contract_ids = list(
        self.campaign.participating_post_analytics.exclude(
            contract__isnull=True).distinct('contract').values_list(
                'contract', flat=True))
    contracts = models.Contract.objects.filter(id__in=contract_ids)
    for contract in queryset_iterator(contracts):
        deliverables = contract.deliverables_json
        for platform in PLATFORMS:
            # ``or 0`` guards against an explicit None stored under 'value'
            totals[platform] += deliverables.get(
                platform.lower(), {}).get('value', 0) or 0
    return totals
def find_squarespace_platforms(inf_ids=None):
    """
    Trying to find potential influencers with squarespace (currently just few for test)

    :param inf_ids: optional list of Influencer ids to restrict the scan to
    :return: None -- writes a tab-separated report file in the working directory
    """
    plats = Platform.objects.filter(platform_name='Custom').filter(
        influencer__show_on_search=True).exclude(influencer__blacklisted=True)
    if isinstance(inf_ids, list):
        plats = plats.filter(influencer_id__in=inf_ids)
    print('Found %s potential platforms to check' % plats.count())
    ctr = 0
    bad_result = []
    unreachable = []
    csvfile = io.open(
        'squarespace_detection_report__%s.csv' %
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    try:
        csvfile.write(
            u'Platform id\tPlatform1 url\tPlatform initial name\tPlatform detected name\tError\n'
        )
        for plat in queryset_iterator(plats):
            # Skip platforms that already have recently fetched posts.
            # FIX: was datetime.datetime(2016, 03, 01) -- leading-zero int
            # literals are Python-2-only syntax; same value, portable form.
            if plat.posts_set.filter(
                    inserted_datetime__gte=datetime.datetime(2016, 3, 1)
            ).count() > 0:
                continue
            initial_pn = plat.platform_name
            error = u''
            try:
                is_squarespace = check_if_squarespace_url(plat.url)
                if is_squarespace is True:
                    plat.platform_name = 'Squarespace'
                    plat.save()
                elif is_squarespace is None:
                    # None from the checker means the url could not be reached
                    unreachable.append(plat.id)
                    error = 'Unreachable'
                else:
                    print('Plat %s is NOT Squarespace' % plat.id)
            except Exception:
                # FIX: was a bare ``except:`` which also swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                bad_result.append(plat.id)
                error = 'Got Exception'
            final_pn = plat.platform_name
            ctr += 1
            if ctr % 1000 == 0:
                print('Performed %s platforms' % ctr)
            csvfile.write(u'%s\t%s\t%s\t%s\t%s\n' %
                          (plat.id, plat.url, initial_pn, final_pn, error))
    finally:
        # FIX: the original never closed the report file handle.
        csvfile.close()
def run(self): # @todo: remove that filter campaigns = models.BrandJobPost.objects.all().filter(id__in=[705, 355]) total = campaigns.count() for n, campaign in enumerate(queryset_iterator(campaigns), start=1): wrapper = campaign_helpers.CampaignReportDataWrapper(campaign) wrapper.save_to_cache() print '* {}/{}'.format(n, total)
def update_facebook_urls_for_campaigns(campaign_ids=None):
    """
    This script updates posts for Facebook platforms for influencers involved
    in campaigns (all or specified)

    :param campaign_ids: None (all campaigns), a single campaign id (int),
        or a list of campaign ids
    :return: None
    """
    from platformdatafetcher.pbfetcher import IndepthPolicy

    if campaign_ids is None:
        brand_job_posts = BrandJobPost.objects.all()
    elif isinstance(campaign_ids, int):
        # IDIOM FIX: was ``type(campaign_ids) is int``
        brand_job_posts = BrandJobPost.objects.filter(id=campaign_ids)
    else:
        brand_job_posts = BrandJobPost.objects.filter(id__in=campaign_ids)

    # getting ids of all influencers in campaigns
    inf_ids = set()
    log.info('Collecting influencers to perform...')
    for bjp in queryset_iterator(brand_job_posts):
        # campaign_stage >= 3: candidates far enough along to matter here
        bjp_inf_ids = list(
            bjp.candidates.filter(campaign_stage__gte=3).values_list(
                'mailbox__influencer__id', flat=True))
        for iid in bjp_inf_ids:
            if iid is not None:
                inf_ids.add(iid)

    log.info('Found %s distinct influencers, performing them' % len(inf_ids))

    # policy to perform
    policy = IndepthPolicy()
    for inf_id in inf_ids:
        try:
            inf = Influencer.objects.get(id=inf_id)
            log.info('Performing influencer %s (%s)' % (inf.id, inf.blogname))
            fb_platforms = inf.platform_set.filter(
                platform_name='Facebook').exclude(url_not_found=True)
            # BUG FIX: the original message contained a %s placeholder but
            # supplied no argument, so the raw format string was logged.
            log.info(
                'This influencer has %s Facebook platforms without url_not_found=True',
                fb_platforms.count())
            for plat in fb_platforms:
                log.info('Performing posts for platform %s (%s)' %
                         (plat.id, plat.url))
                pf = UpdatingFacebookFetcher(plat, policy)
                # return value unused -- fetching persists posts as a side effect
                pf.fetch_posts(max_pages=5)
                log.info('5 pages of platform %s were performed' % plat.id)
        except Influencer.DoesNotExist:
            log.error('Influencer %s was not found' % inf_id)
def normalize_influencer_locations(): from debra import models from platformdatafetcher import geocoding from social_discovery.blog_discovery import queryset_iterator # 96761 bloggers infs = models.Influencer.objects.filter(old_show_on_search=True).exclude( source__contains='brand').exclude(blacklisted=True) total = infs.count() changed_count = 0 for n, inf in enumerate(queryset_iterator(infs), start=1): print '******* {}/{} *******'.format(n, total) changed_count += int(bool(geocoding.handle_influencer_demographics(inf, diff_only=True))) print '(changed count: {})'.format(changed_count)
def copy_brand_slices(start, end):
    """Copy the [start:end] slice of Brands (ordered by id) from the
    'default' database into 'production', logging progress every 1000 rows."""
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying Brands...' % stamp())
    brand_ids = Brands.objects.using('default').all().order_by(
        'id').values_list('id', flat=True)[start:end]
    slice_qs = Brands.objects.filter(id__in=brand_ids)
    saved = 0
    for brand in queryset_iterator(slice_qs):
        brand.save(using='production')
        saved += 1
        if saved % 1000 == 0:
            print('%s Saved %s Brands models' % (stamp(), saved))
    print('%s Copied %s Brands objects.' % (stamp(), saved))
def copy_user_profiles_slicks(start, end):
    """Copy the [start:end] slice of UserProfiles (ordered by id) from the
    'default' database into 'production'.

    NOTE(review): the influencer FK is cleared before saving -- presumably
    it is re-linked by a later copy step once Influencers exist on
    production; confirm against the overall copy sequence.
    """
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying UserProfile (!)...' % stamp())
    profile_ids = UserProfile.objects.using('default').all().order_by(
        'id').values_list('id', flat=True)[start:end]
    slice_qs = UserProfile.objects.filter(id__in=profile_ids)
    saved = 0
    for profile in queryset_iterator(slice_qs):
        profile.influencer = None
        profile.save(using='production')
        saved += 1
        if saved % 1000 == 0:
            print('%s Saved %s UserProfile models' % (stamp(), saved))
    print('%s Copied %s UserProfile objects.' % (stamp(), saved))
def generate_report_social_urls_new_mommy():
    """
    Creates a csv report for collected social_urls in new_mommy_hashtags
    collections.

    :return: None -- writes a tab-separated report file in the working directory
    """
    profiles_qs = InstagramProfile.objects.filter(
        tags__contains="new_mommy_hashtags",
        friends_count__gte=5000
    ).filter(
        Q(social_urls_detected__isnull=False) |
        Q(non_social_urls_detected__isnull=False)
    ).order_by('id')
    log.info('Found %s InstagramProfiles' % profiles_qs.count())

    report = io.open(
        'social_urls_detected__mommy__%s.csv' % datetime.datetime.strftime(
            datetime.datetime.now(), '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    report.write(
        u'InstagramProfile id\turl\tDescription\tExternal url\tsocial_urls_detected\tnon_social_urls_detected\tPlatforms found\tFirst 10 platform ids\tIC TAG\tDiscovered Influencer Id\tBlog Url\t\n'
    )
    for profile in queryset_iterator(profiles_qs):
        description = profile.get_description_from_api()
        if description is not None:
            # keep the row on one line: strip tabs/newlines from free text
            description = description.replace(u'\t', u'').replace(u'\n', u'')
        platform_ids = profile.get_platform_ids_detected()
        all_tags = [] if profile.tags is None else profile.tags.split()
        ic_tags = [tag for tag in all_tags if tag.startswith('IC_')]
        influencer = profile.discovered_influencer
        report.write(
            u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n' % (
                profile.id,
                profile.get_url(),
                description,
                profile.get_url_from_api(),
                profile.social_urls_detected,
                profile.non_social_urls_detected,
                len(platform_ids),
                '' if len(platform_ids) == 0 else platform_ids[:10],
                ic_tags,
                influencer.id if influencer is not None else None,
                influencer.blog_url if influencer is not None else None,
            )
        )
    report.close()
def classify_queryset(self, queryset=None, category=None, to_tag=True, **kwargs):
    """ Helper method. Source_queryset should be a queryset for InstagramProfiles.
    Same as above but performs the whole queryset. Could return a dict of pairs
    ' id: classification_value ' or a queryset object with excluding by ids.
    Example: We want to filter queryset so only bloggers should remain: we call
    the function as cs.classify_queryset(source_queryset=qs, category='blogger')
    :param to_tag if set True, then category tag will be set to these profiles
    """
    # Unknown category: nothing to classify, hand the queryset back untouched.
    if category not in self.AVAILABLE_CATEGORIES:
        return queryset
    profiles = queryset_iterator(queryset)
    ids = set()
    for profile in profiles:
        if self.classify_unit(profile) == category:
            ids.add(profile.id)
            # setting tag for classified profiles
            if to_tag:
                profile.append_mutual_exclusive_tag(
                    category, self.AVAILABLE_CATEGORIES)
            # creating a SocialProfileOp object for this event
            # NOTE(review): nesting reconstructed from collapsed source --
            # the audit record is created for every matched profile
            # regardless of to_tag; confirm against the original layout.
            SocialProfileOp.objects.create(
                profile_id=profile.id,
                description=category,
                module_classname=type(self).__name__,
                data={})
    # Narrow the original queryset to just the profiles that matched.
    return queryset.filter(id__in=ids)
def copy_influencers():
    """Copy searchable, non-blacklisted, non-artificial Influencers from the
    'default' database into 'production', and re-point each related
    UserProfile (reached via shelf_user) at the copied influencer on
    production."""
    print('%s Copying Influencer (!)...' %
          datetime.datetime.now().strftime("[%H:%M:%S]"))
    users = Influencer.objects.using('default').filter(
        show_on_search=True).exclude(blacklisted=True).exclude(
            blog_url__contains='artificial_blog')
    ctr = 0
    for obj in queryset_iterator(users):
        obj.save(using='production')
        if obj.shelf_user:
            up = obj.shelf_user.userprofile
            # BUG FIX: UserProfile.influencer is a ForeignKey (see the
            # copy_part_1 docstring), but the original assigned the raw id
            # (``up.influencer = obj.id``), which Django's FK descriptor
            # rejects with a ValueError. Assign the instance instead
            # (equivalently: up.influencer_id = obj.id).
            up.influencer = obj
            up.save(using='production')
        ctr += 1
        if ctr % 1000 == 0:
            print('%s Saved %s Influencer models' %
                  (datetime.datetime.now().strftime("[%H:%M:%S]"), ctr))
    print('%s Copied %s Influencer objects.' %
          (datetime.datetime.now().strftime("[%H:%M:%S]"), ctr))
def generate_report_social_urls_have_youtube():
    """
    Creates a csv report for collected social_urls in new_mommy_hashtags
    colelctions.

    :return: None -- writes a tab-separated report file in the working directory
    """
    profiles_qs = InstagramProfile.objects.filter(
        tags__contains="have_youtube",
        friends_count__gte=5000
    ).filter(
        Q(social_urls_detected__isnull=False) |
        Q(non_social_urls_detected__isnull=False)
    )
    log.info('Found %s InstagramProfiles' % profiles_qs.count())

    report = io.open(
        'social_urls_detected__have_youtube__%s.csv' %
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    report.write(
        u'InstagramProfile id\turl\tDescription\tExternal url\tsocial_urls_detected\tnon_social_urls_detected\tPlatforms found\tFirst 10 platform ids\t\n'
    )
    for profile in queryset_iterator(profiles_qs):
        description = profile.get_description_from_api()
        if description is not None:
            # keep the row on one line: strip tabs/newlines from free text
            description = description.replace(u'\t', u'').replace(u'\n', u'')
        platform_ids = profile.get_platform_ids_detected()
        report.write(
            u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n' % (
                profile.id,
                profile.get_url(),
                description,
                profile.get_url_from_api(),
                profile.social_urls_detected,
                profile.non_social_urls_detected,
                len(platform_ids),
                '' if len(platform_ids) == 0 else platform_ids[:10],
            )
        )
    report.close()
def hide_reporting_tab_for_new_brands(signup_threshold=None, count_only=False): from debra.models import Brands, User from social_discovery.blog_discovery import queryset_iterator signup_threshold = signup_threshold or datetime.date(2016, 2, 1) brands = Brands.objects.filter(is_subscribed=True) total = brands.count() brands_to_disable = [] for n, brand in enumerate(queryset_iterator(brands), start=1): has_old_users = brand.related_user_profiles.filter( user_profile__user__date_joined__lt=signup_threshold ).count() > 0 if not has_old_users: if not count_only: brand.flag_post_reporting_on = False brand.flag_report_roi_prediction = False brand.save() brands_to_disable.append(brand) print '* {}/{}, number of brands with hidden tab so far: {}'.format( n, total, len(brands_to_disable)) return brands_to_disable
def run_pipeline(self, data=None):
    """ This function runs pipeline for execution.

    :param data: a single InstagramProfile id (int or str), a list of ids,
        a QuerySet of InstagramProfiles, or None (falls back to all profiles
        with at least DEFAULT_MINIMUM_FRIENDS_COUNT friends)
    :return: None -- schedules one crawler_task per profile on the queue of
        the first pipeline step
    """
    if not self.PIPELINE_ROUTE or not isinstance(self.PIPELINE_ROUTE, (
            list,
            tuple,
    )):
        log.error(
            ('Pipeline route is empty or incorrectly given: {}, exiting.'
             ).format(self.PIPELINE_ROUTE))
        return

    # Normalize ``data`` into a queryset of InstagramProfiles.
    # IDIOM FIX: was ``type(data) in [int, str]``.
    if isinstance(data, (int, str)):
        queryset = InstagramProfile.objects.filter(id=data)
    elif isinstance(data, list):
        queryset = InstagramProfile.objects.filter(id__in=data)
    elif isinstance(data, QuerySet):
        queryset = data
    else:
        # TODO: Maybe fetch all profiles for the last day?
        queryset = InstagramProfile.objects.filter(
            friends_count__gte=self.DEFAULT_MINIMUM_FRIENDS_COUNT)

    profiles = queryset_iterator(queryset)
    log.info('Performing %s profiles...' % queryset.count())
    first_step = self.PIPELINE_ROUTE[0]  # hoisted loop-invariant lookup
    for profile in profiles:
        crawler_task.apply_async(
            kwargs={
                'klass_name': first_step,
                'task_type': 'pipeline',
                'profile_id': profile.id,
                'route': self.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(first_step))
def refetch_moz_data_for_platforms(start_id=None,
                                   end_id=None,
                                   moz_access_id=None,
                                   moz_secret_key=None):
    """
    Refetches all MOZ data for Blog non-artificial platforms.

    :param start_id: optional inclusive lower bound on platform id
    :param end_id: optional inclusive upper bound on platform id
    :param moz_access_id: MOZ API access id forwarded to refetch_moz_data()
    :param moz_secret_key: MOZ API secret key forwarded to refetch_moz_data()
    :return: None
    """
    # TODO: envelop it in task and schedule it to once per month
    from debra.models import Platform
    from social_discovery.blog_discovery import queryset_iterator
    import time

    ctr = 0

    # ROBUSTNESS FIX: the original always applied id__gte=start_id /
    # id__lte=end_id, which breaks when the defaults (None) are used.
    # Only apply the bounds that were actually supplied.
    id_bounds = {}
    if start_id is not None:
        id_bounds['id__gte'] = start_id
    if end_id is not None:
        id_bounds['id__lte'] = end_id

    platforms = Platform.objects.filter(
        platform_name__in=Platform.BLOG_PLATFORMS,
        influencer__show_on_search=True,
        **id_bounds
    ).exclude(
        url_not_found=True
    ).exclude(
        url__startswith='http://www.theshelf.com/artificial_blog/'
    ).exclude(
        influencer__blacklisted=True
    ).exclude(
        moz_domain_authority__gte=0  # skip platforms that already have moz data
    ).order_by('id')

    # (removed a large commented-out duplicate of the loop below)
    for pl in queryset_iterator(platforms):
        pl.refetch_moz_data(moz_access_id, moz_secret_key)
        ctr += 1
        print('%s updated platform %s (%s / %s / %s)' % (
            ctr,
            pl.id,
            pl.moz_domain_authority,
            pl.moz_page_authority,
            pl.moz_external_links,
        ))
        if ctr % 1000 == 0:
            log.info('Updated moz data for %s platforms' % ctr)
        # throttle: stay under the MOZ API rate limit
        time.sleep(11)

    log.info('Finished updating moz data for %s platforms' % ctr)