def _copy_all_to_production(model_cls, model_name):
    """Copy every row of ``model_cls`` from the 'default' DB to 'production'.

    Prints progress every 1000 rows. Returns the number of objects copied.

    :param model_cls: Django model class whose rows are copied wholesale
    :param model_name: human-readable name used in progress messages
    """
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying %s...' % (stamp(), model_name))
    ctr = 0
    for obj in queryset_iterator(model_cls.objects.using('default').all()):
        # save(using=...) inserts the row (same pk) into the production DB
        obj.save(using='production')
        ctr += 1
        if ctr % 1000 == 0:
            print('%s Saved %s %s models' % (stamp(), ctr, model_name))
    print('%s Copied %s %s objects.' % (stamp(), ctr, model_name))
    return ctr


def copy_part_1():
    """ Part I of copying data:

    django.contrib.auth.models.User (Django's users table)
    debra.DemographicsLocality (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.BrandCategory (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.Category (Has no FKs, is a "reference" ? Should be copied with ALL its data?)
    debra.Brands (Has ManyToMany: debra.BrandCategory, debra.Brands is a "reference" ? Should be copied with ALL its data?)
    debra.UserProfile (Has OneToOne to: django.contrib.auth.models.User, debra.Brands, FK to: debra.Influencer)
    debra.Influencer (Has FK to: django.contrib.auth.models.User, debra.DemographicsLocality, debra.UserProfile)

    :return:
    """
    # The original body repeated the same copy loop three times verbatim;
    # factored into _copy_all_to_production (behavior unchanged).
    _copy_all_to_production(DemographicsLocality, 'DemographicsLocality')
    _copy_all_to_production(BrandCategory, 'BrandCategory')
    _copy_all_to_production(Category, 'Category')
def classify_queryset(self, queryset=None, category=None, to_tag=True, **kwargs):
    """ Helper method. Source_queryset should be a queryset for InstagramProfiles.
    Same as above but performs the whole queryset. Could return a dict of pairs
    ' id: classification_value ' or a queryset object with excluding by ids.
    Example: We want to filter queryset so only bloggers should remain: we call
    the function as cs.classify_queryset(source_queryset=qs, category='blogger')
    Method's drawback: could be extremely time consuming queryset. Could be
    changed by filtering out ids of objects.
    :param to_tag if set True, then category tag will be set to these profiles
    """
    # Unknown category: nothing to classify, hand the queryset back untouched.
    if category not in self.AVAILABLE_CATEGORIES:
        return queryset
    profiles = queryset_iterator(queryset)
    ids = set()
    for profile in profiles:
        biography = profile.api_data.get('biography')
        # Only profiles with a biography are considered for classification.
        if biography is not None and self.classify_unit(
                profile) == category:
            ids.add(profile.id)
            # setting tag for classified profiles
            if to_tag:
                # profile.append_mutual_exclusive_tag(category, self.AVAILABLE_CATEGORIES)
                # NOTE(review): category passed the AVAILABLE_CATEGORIES guard
                # above, so (unless None is a member of AVAILABLE_CATEGORIES)
                # this condition is always True and the elif below is dead code
                # -- confirm before relying on the tag-stripping branch.
                if category is not None:
                    profile.append_mutual_exclusive_tag(
                        category, self.AVAILABLE_CATEGORIES)
                elif profile.tags is not None and any(
                        [t in profile.tags for t in self.AVAILABLE_CATEGORIES]):
                    # Strip every known category token from the tag string.
                    profile.tags = ' '.join([
                        t for t in profile.tags.split()
                        if t not in self.AVAILABLE_CATEGORIES
                    ])
                    # NOTE(review): save() placement reconstructed from
                    # collapsed source -- presumably it persists the tag
                    # rewrite above; confirm it should not run on the
                    # append_mutual_exclusive_tag path too.
                    profile.save()
            # creating a SocialProfileOp object for this event
            SocialProfileOp.objects.create(
                profile_id=profile.id,
                description=category,
                module_classname=type(self).__name__,
                data={})
    # Narrow the original queryset to just the profiles that matched.
    return queryset.filter(id__in=ids)
def _get_required_number_of_posts_for_each_platform(
        self, per_influencer=True):
    """Return a mapping of platform name -> required number of posts.

    When ``per_influencer`` is True the totals are summed from the
    deliverables of every distinct contract participating in the campaign;
    otherwise the campaign-level deliverables are multiplied by the
    influencer count.
    """
    if not per_influencer:
        campaign_deliverables = self.campaign.deliverables_json
        return {
            platform: campaign_deliverables.get(
                platform.lower(), {}).get('value', 0) * self.influencers_count
            for platform in PLATFORMS
        }

    totals = defaultdict(int)
    # Distinct contracts referenced by the campaign's post analytics.
    contract_ids = list(
        self.campaign.participating_post_analytics.exclude(
            contract__isnull=True).distinct('contract').values_list(
                'contract', flat=True))
    contracts = models.Contract.objects.filter(id__in=contract_ids)
    for contract in queryset_iterator(contracts):
        deliverables = contract.deliverables_json
        for platform in PLATFORMS:
            # ``or 0`` guards against an explicit None stored under 'value'
            totals[platform] += deliverables.get(
                platform.lower(), {}).get('value', 0) or 0
    return totals
def find_squarespace_platforms(inf_ids=None):
    """
    Trying to find potential influencers with squarespace (currently just few for test)

    :param inf_ids: optional list of Influencer ids to restrict the scan to
    :return: None -- writes a tab-separated report file in the working directory
    """
    plats = Platform.objects.filter(platform_name='Custom').filter(
        influencer__show_on_search=True).exclude(influencer__blacklisted=True)
    if isinstance(inf_ids, list):
        plats = plats.filter(influencer_id__in=inf_ids)
    print('Found %s potential platforms to check' % plats.count())
    ctr = 0
    bad_result = []
    unreachable = []
    csvfile = io.open(
        'squarespace_detection_report__%s.csv' %
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    try:
        csvfile.write(
            u'Platform id\tPlatform1 url\tPlatform initial name\tPlatform detected name\tError\n'
        )
        for plat in queryset_iterator(plats):
            # Skip platforms that already have recently fetched posts.
            # FIX: was datetime.datetime(2016, 03, 01) -- leading-zero int
            # literals are Python-2-only syntax; same value, portable form.
            if plat.posts_set.filter(
                    inserted_datetime__gte=datetime.datetime(2016, 3, 1)
            ).count() > 0:
                continue
            initial_pn = plat.platform_name
            error = u''
            try:
                is_squarespace = check_if_squarespace_url(plat.url)
                if is_squarespace is True:
                    plat.platform_name = 'Squarespace'
                    plat.save()
                elif is_squarespace is None:
                    # None from the checker means the url could not be reached
                    unreachable.append(plat.id)
                    error = 'Unreachable'
                else:
                    print('Plat %s is NOT Squarespace' % plat.id)
            except Exception:
                # FIX: was a bare ``except:`` which also swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                bad_result.append(plat.id)
                error = 'Got Exception'
            final_pn = plat.platform_name
            ctr += 1
            if ctr % 1000 == 0:
                print('Performed %s platforms' % ctr)
            csvfile.write(u'%s\t%s\t%s\t%s\t%s\n' %
                          (plat.id, plat.url, initial_pn, final_pn, error))
    finally:
        # FIX: the original never closed the report file handle.
        csvfile.close()
def run(self): # @todo: remove that filter campaigns = models.BrandJobPost.objects.all().filter(id__in=[705, 355]) total = campaigns.count() for n, campaign in enumerate(queryset_iterator(campaigns), start=1): wrapper = campaign_helpers.CampaignReportDataWrapper(campaign) wrapper.save_to_cache() print '* {}/{}'.format(n, total)
def update_facebook_urls_for_campaigns(campaign_ids=None):
    """
    This script updates posts for Facebook platforms for influencers involved
    in campaigns (all or specified)

    :param campaign_ids: None (all campaigns), a single campaign id (int),
        or a list of campaign ids
    :return: None
    """
    from platformdatafetcher.pbfetcher import IndepthPolicy

    if campaign_ids is None:
        brand_job_posts = BrandJobPost.objects.all()
    elif isinstance(campaign_ids, int):
        # IDIOM FIX: was ``type(campaign_ids) is int``
        brand_job_posts = BrandJobPost.objects.filter(id=campaign_ids)
    else:
        brand_job_posts = BrandJobPost.objects.filter(id__in=campaign_ids)

    # getting ids of all influencers in campaigns
    inf_ids = set()
    log.info('Collecting influencers to perform...')
    for bjp in queryset_iterator(brand_job_posts):
        # campaign_stage >= 3: candidates far enough along to matter here
        bjp_inf_ids = list(
            bjp.candidates.filter(campaign_stage__gte=3).values_list(
                'mailbox__influencer__id', flat=True))
        for iid in bjp_inf_ids:
            if iid is not None:
                inf_ids.add(iid)

    log.info('Found %s distinct influencers, performing them' % len(inf_ids))

    # policy to perform
    policy = IndepthPolicy()
    for inf_id in inf_ids:
        try:
            inf = Influencer.objects.get(id=inf_id)
            log.info('Performing influencer %s (%s)' % (inf.id, inf.blogname))
            fb_platforms = inf.platform_set.filter(
                platform_name='Facebook').exclude(url_not_found=True)
            # BUG FIX: the original message contained a %s placeholder but
            # supplied no argument, so the raw format string was logged.
            log.info(
                'This influencer has %s Facebook platforms without url_not_found=True',
                fb_platforms.count())
            for plat in fb_platforms:
                log.info('Performing posts for platform %s (%s)' %
                         (plat.id, plat.url))
                pf = UpdatingFacebookFetcher(plat, policy)
                # return value unused -- fetching persists posts as a side effect
                pf.fetch_posts(max_pages=5)
                log.info('5 pages of platform %s were performed' % plat.id)
        except Influencer.DoesNotExist:
            log.error('Influencer %s was not found' % inf_id)
def normalize_influencer_locations(): from debra import models from platformdatafetcher import geocoding from social_discovery.blog_discovery import queryset_iterator # 96761 bloggers infs = models.Influencer.objects.filter(old_show_on_search=True).exclude( source__contains='brand').exclude(blacklisted=True) total = infs.count() changed_count = 0 for n, inf in enumerate(queryset_iterator(infs), start=1): print '******* {}/{} *******'.format(n, total) changed_count += int(bool(geocoding.handle_influencer_demographics(inf, diff_only=True))) print '(changed count: {})'.format(changed_count)
def copy_brand_slices(start, end):
    """Copy the [start:end] slice of Brands (ordered by id) from the
    'default' database into 'production', logging progress every 1000 rows."""
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying Brands...' % stamp())
    brand_ids = Brands.objects.using('default').all().order_by(
        'id').values_list('id', flat=True)[start:end]
    slice_qs = Brands.objects.filter(id__in=brand_ids)
    saved = 0
    for brand in queryset_iterator(slice_qs):
        brand.save(using='production')
        saved += 1
        if saved % 1000 == 0:
            print('%s Saved %s Brands models' % (stamp(), saved))
    print('%s Copied %s Brands objects.' % (stamp(), saved))
def copy_user_profiles_slicks(start, end):
    """Copy the [start:end] slice of UserProfiles (ordered by id) from the
    'default' database into 'production'.

    NOTE(review): the influencer FK is cleared before saving -- presumably
    it is re-linked by a later copy step once Influencers exist on
    production; confirm against the overall copy sequence.
    """
    stamp = lambda: datetime.datetime.now().strftime("[%H:%M:%S]")
    print('%s Copying UserProfile (!)...' % stamp())
    profile_ids = UserProfile.objects.using('default').all().order_by(
        'id').values_list('id', flat=True)[start:end]
    slice_qs = UserProfile.objects.filter(id__in=profile_ids)
    saved = 0
    for profile in queryset_iterator(slice_qs):
        profile.influencer = None
        profile.save(using='production')
        saved += 1
        if saved % 1000 == 0:
            print('%s Saved %s UserProfile models' % (stamp(), saved))
    print('%s Copied %s UserProfile objects.' % (stamp(), saved))
def generate_report_social_urls_new_mommy():
    """
    Creates a csv report for collected social_urls in new_mommy_hashtags
    collections.

    :return: None -- writes a tab-separated report file in the working directory
    """
    profiles_qs = InstagramProfile.objects.filter(
        tags__contains="new_mommy_hashtags",
        friends_count__gte=5000
    ).filter(
        Q(social_urls_detected__isnull=False) |
        Q(non_social_urls_detected__isnull=False)
    ).order_by('id')
    log.info('Found %s InstagramProfiles' % profiles_qs.count())

    report = io.open(
        'social_urls_detected__mommy__%s.csv' % datetime.datetime.strftime(
            datetime.datetime.now(), '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    report.write(
        u'InstagramProfile id\turl\tDescription\tExternal url\tsocial_urls_detected\tnon_social_urls_detected\tPlatforms found\tFirst 10 platform ids\tIC TAG\tDiscovered Influencer Id\tBlog Url\t\n'
    )
    for profile in queryset_iterator(profiles_qs):
        description = profile.get_description_from_api()
        if description is not None:
            # keep the row on one line: strip tabs/newlines from free text
            description = description.replace(u'\t', u'').replace(u'\n', u'')
        platform_ids = profile.get_platform_ids_detected()
        all_tags = [] if profile.tags is None else profile.tags.split()
        ic_tags = [tag for tag in all_tags if tag.startswith('IC_')]
        influencer = profile.discovered_influencer
        report.write(
            u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n' % (
                profile.id,
                profile.get_url(),
                description,
                profile.get_url_from_api(),
                profile.social_urls_detected,
                profile.non_social_urls_detected,
                len(platform_ids),
                '' if len(platform_ids) == 0 else platform_ids[:10],
                ic_tags,
                influencer.id if influencer is not None else None,
                influencer.blog_url if influencer is not None else None,
            )
        )
    report.close()
def classify_queryset(self, queryset=None, category=None, to_tag=True, **kwargs):
    """ Helper method. Source_queryset should be a queryset for InstagramProfiles.
    Same as above but performs the whole queryset. Could return a dict of pairs
    ' id: classification_value ' or a queryset object with excluding by ids.
    Example: We want to filter queryset so only bloggers should remain: we call
    the function as cs.classify_queryset(source_queryset=qs, category='blogger')
    :param to_tag if set True, then category tag will be set to these profiles
    """
    # Unknown category: nothing to classify, hand the queryset back untouched.
    if category not in self.AVAILABLE_CATEGORIES:
        return queryset
    profiles = queryset_iterator(queryset)
    ids = set()
    for profile in profiles:
        if self.classify_unit(profile) == category:
            ids.add(profile.id)
            # setting tag for classified profiles
            if to_tag:
                profile.append_mutual_exclusive_tag(
                    category, self.AVAILABLE_CATEGORIES)
            # creating a SocialProfileOp object for this event
            # NOTE(review): nesting reconstructed from collapsed source --
            # the audit record is created for every matched profile
            # regardless of to_tag; confirm against the original layout.
            SocialProfileOp.objects.create(
                profile_id=profile.id,
                description=category,
                module_classname=type(self).__name__,
                data={})
    # Narrow the original queryset to just the profiles that matched.
    return queryset.filter(id__in=ids)
def copy_influencers():
    """Copy searchable, non-blacklisted, non-artificial Influencers from the
    'default' database into 'production', and re-point each related
    UserProfile (reached via shelf_user) at the copied influencer on
    production."""
    print('%s Copying Influencer (!)...' %
          datetime.datetime.now().strftime("[%H:%M:%S]"))
    users = Influencer.objects.using('default').filter(
        show_on_search=True).exclude(blacklisted=True).exclude(
            blog_url__contains='artificial_blog')
    ctr = 0
    for obj in queryset_iterator(users):
        obj.save(using='production')
        if obj.shelf_user:
            up = obj.shelf_user.userprofile
            # BUG FIX: UserProfile.influencer is a ForeignKey (see the
            # copy_part_1 docstring), but the original assigned the raw id
            # (``up.influencer = obj.id``), which Django's FK descriptor
            # rejects with a ValueError. Assign the instance instead
            # (equivalently: up.influencer_id = obj.id).
            up.influencer = obj
            up.save(using='production')
        ctr += 1
        if ctr % 1000 == 0:
            print('%s Saved %s Influencer models' %
                  (datetime.datetime.now().strftime("[%H:%M:%S]"), ctr))
    print('%s Copied %s Influencer objects.' %
          (datetime.datetime.now().strftime("[%H:%M:%S]"), ctr))
def generate_report_social_urls_have_youtube():
    """
    Creates a csv report for collected social_urls in new_mommy_hashtags
    colelctions.

    :return: None -- writes a tab-separated report file in the working directory
    """
    profiles_qs = InstagramProfile.objects.filter(
        tags__contains="have_youtube",
        friends_count__gte=5000
    ).filter(
        Q(social_urls_detected__isnull=False) |
        Q(non_social_urls_detected__isnull=False)
    )
    log.info('Found %s InstagramProfiles' % profiles_qs.count())

    report = io.open(
        'social_urls_detected__have_youtube__%s.csv' %
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d_%H%M%S'),
        'w+',
        encoding='utf-8')
    report.write(
        u'InstagramProfile id\turl\tDescription\tExternal url\tsocial_urls_detected\tnon_social_urls_detected\tPlatforms found\tFirst 10 platform ids\t\n'
    )
    for profile in queryset_iterator(profiles_qs):
        description = profile.get_description_from_api()
        if description is not None:
            # keep the row on one line: strip tabs/newlines from free text
            description = description.replace(u'\t', u'').replace(u'\n', u'')
        platform_ids = profile.get_platform_ids_detected()
        report.write(
            u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n' % (
                profile.id,
                profile.get_url(),
                description,
                profile.get_url_from_api(),
                profile.social_urls_detected,
                profile.non_social_urls_detected,
                len(platform_ids),
                '' if len(platform_ids) == 0 else platform_ids[:10],
            )
        )
    report.close()
def hide_reporting_tab_for_new_brands(signup_threshold=None, count_only=False): from debra.models import Brands, User from social_discovery.blog_discovery import queryset_iterator signup_threshold = signup_threshold or datetime.date(2016, 2, 1) brands = Brands.objects.filter(is_subscribed=True) total = brands.count() brands_to_disable = [] for n, brand in enumerate(queryset_iterator(brands), start=1): has_old_users = brand.related_user_profiles.filter( user_profile__user__date_joined__lt=signup_threshold ).count() > 0 if not has_old_users: if not count_only: brand.flag_post_reporting_on = False brand.flag_report_roi_prediction = False brand.save() brands_to_disable.append(brand) print '* {}/{}, number of brands with hidden tab so far: {}'.format( n, total, len(brands_to_disable)) return brands_to_disable
def run_pipeline(self, data=None):
    """ This function runs pipeline for execution.

    :param data: a single InstagramProfile id (int or str), a list of ids,
        a QuerySet of InstagramProfiles, or None (falls back to all profiles
        with at least DEFAULT_MINIMUM_FRIENDS_COUNT friends)
    :return: None -- schedules one crawler_task per profile on the queue of
        the first pipeline step
    """
    if not self.PIPELINE_ROUTE or not isinstance(self.PIPELINE_ROUTE, (
            list,
            tuple,
    )):
        log.error(
            ('Pipeline route is empty or incorrectly given: {}, exiting.'
             ).format(self.PIPELINE_ROUTE))
        return

    # Normalize ``data`` into a queryset of InstagramProfiles.
    # IDIOM FIX: was ``type(data) in [int, str]``.
    if isinstance(data, (int, str)):
        queryset = InstagramProfile.objects.filter(id=data)
    elif isinstance(data, list):
        queryset = InstagramProfile.objects.filter(id__in=data)
    elif isinstance(data, QuerySet):
        queryset = data
    else:
        # TODO: Maybe fetch all profiles for the last day?
        queryset = InstagramProfile.objects.filter(
            friends_count__gte=self.DEFAULT_MINIMUM_FRIENDS_COUNT)

    profiles = queryset_iterator(queryset)
    log.info('Performing %s profiles...' % queryset.count())
    first_step = self.PIPELINE_ROUTE[0]  # hoisted loop-invariant lookup
    for profile in profiles:
        crawler_task.apply_async(
            kwargs={
                'klass_name': first_step,
                'task_type': 'pipeline',
                'profile_id': profile.id,
                'route': self.PIPELINE_ROUTE,
            },
            queue=get_queue_name_by_pipeline_step(first_step))
def refetch_moz_data_for_platforms(start_id=None,
                                   end_id=None,
                                   moz_access_id=None,
                                   moz_secret_key=None):
    """
    Refetches all MOZ data for Blog non-artificial platforms.

    :param start_id: optional inclusive lower bound on platform id
    :param end_id: optional inclusive upper bound on platform id
    :param moz_access_id: MOZ API access id forwarded to refetch_moz_data()
    :param moz_secret_key: MOZ API secret key forwarded to refetch_moz_data()
    :return: None
    """
    # TODO: envelop it in task and schedule it to once per month
    from debra.models import Platform
    from social_discovery.blog_discovery import queryset_iterator
    import time

    ctr = 0

    # ROBUSTNESS FIX: the original always applied id__gte=start_id /
    # id__lte=end_id, which breaks when the defaults (None) are used.
    # Only apply the bounds that were actually supplied.
    id_bounds = {}
    if start_id is not None:
        id_bounds['id__gte'] = start_id
    if end_id is not None:
        id_bounds['id__lte'] = end_id

    platforms = Platform.objects.filter(
        platform_name__in=Platform.BLOG_PLATFORMS,
        influencer__show_on_search=True,
        **id_bounds
    ).exclude(
        url_not_found=True
    ).exclude(
        url__startswith='http://www.theshelf.com/artificial_blog/'
    ).exclude(
        influencer__blacklisted=True
    ).exclude(
        moz_domain_authority__gte=0  # skip platforms that already have moz data
    ).order_by('id')

    # (removed a large commented-out duplicate of the loop below)
    for pl in queryset_iterator(platforms):
        pl.refetch_moz_data(moz_access_id, moz_secret_key)
        ctr += 1
        print('%s updated platform %s (%s / %s / %s)' % (
            ctr,
            pl.id,
            pl.moz_domain_authority,
            pl.moz_page_authority,
            pl.moz_external_links,
        ))
        if ctr % 1000 == 0:
            log.info('Updated moz data for %s platforms' % ctr)
        # throttle: stay under the MOZ API rate limit
        time.sleep(11)

    log.info('Finished updating moz data for %s platforms' % ctr)