def find_and_reimport_linking_posts(slot=0):
    """Re-flag posts that link to sidebar-sponsorship post urls so their
    products get re-imported.

    :param slot: worker index in ``0 .. num_workers-1``; each slot processes
        its own share of the ``show_on_search`` influencer list.
    """
    connection = db_util.connection_for_reading()
    cur = connection.cursor()
    cur.execute("""
        select po.url
        from debra_sponsorshipinfo si
        join debra_posts po on si.post_id=po.id
        where si.sidebar=true;
    """)
    bad_post_urls = {r[0] for r in cur}
    log.info('Got %d bad urls', len(bad_post_urls))
    infs = debra.models.Influencer.objects.filter(
        show_on_search=True).order_by('id')
    count = infs.count()
    num_workers = 10
    slice_val = count // num_workers
    start = slot * slice_val
    # BUGFIX: the last slot also takes the remainder (count % num_workers)
    # influencers, which the original even slicing silently skipped.
    end = count if slot == num_workers - 1 else (slot + 1) * slice_val
    for inf in infs[start:end]:
        log.info('Processing influencer %r', inf)
        all_posts = inf.posts_set.all()
        for post in all_posts.iterator():
            try:
                content = platformutils.iterate_resolve_shortened_urls(
                    post.content)
                all_urls = contentfiltering.find_all_urls(content)
                log.info('Urls in post %r: %r', post, all_urls)
                for url in all_urls:
                    url = utils.remove_query_params(url)
                    if url in bad_post_urls:
                        log.warn('Bad url: %r', url)
                        post.brandinpost_set.all().delete()
                        post.products_import_completed = False
                        post.save()
            # narrowed from a bare except: still best-effort per post, but
            # no longer swallows SystemExit/KeyboardInterrupt
            except Exception:
                log.exception('While processing %r', post)
def get_batch(batch_size=10000):
    """Return up to ``batch_size`` distinct post ids from the interactions
    table whose platform_id has not been filled in yet."""
    cursor = db_util.connection_for_reading().cursor()
    sql = 'SELECT post_id FROM {interactions} WHERE platform_id IS NULL LIMIT %s'.format(
        interactions=INTERACTIONS_TABLE)
    cursor.execute(sql, [batch_size])
    unique_ids = set()
    for row in cursor.fetchall():
        unique_ids.add(row[0])
    return list(unique_ids)
def get_blog_platforms():
    """Return ids of blog platforms (Wordpress/Blogspot/Custom) that have
    at least one post."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select distinct p.id from debra_platform p inner join debra_posts ps on ps.platform_id = p.id where p.platform_name in ('Wordpress', 'Blogspot', 'Custom') """)
    return [platform_id for (platform_id,) in cursor.fetchall()]
def blacklist_platforms_with_fetch_errors():
    """Set url_not_found=True on every platform matched by
    SQL_PLATFORM_IDS_WITH_FETCH_ERRORS, recording an op per platform."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(SQL_PLATFORM_IDS_WITH_FETCH_ERRORS)
    log.info('%d plats to blacklist', cursor.rowcount)
    for (platform_id,) in cursor:
        platform = Platform.objects.get(id=platform_id)
        with platformutils.OpRecorder(
                operation='blacklist_platforms_with_fetch_errors',
                platform=platform) as opr:
            log.info('Blacklisting platform %r', platform)
            platform.url_not_found = True
            platform.save()
def fetch_prods_for_posts_with_sponsorships_but_no_products():
    """Queue product fetching for posts that carry sponsorship info but have
    no product shelf mappings, even though marked import-complete."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select po.id from debra_posts po where not exists(select * from debra_productmodelshelfmap pmsm where pmsm.post_id=po.id) and exists(select * from debra_sponsorshipinfo si where si.post_id=po.id) and po.products_import_completed = true """)
    ids_to_fetch = [row[0] for row in cursor]
    _submit_fetch_prods(ids_to_fetch)
def update_batch(post_ids):
    """Copy platform_id from debra_posts into the interactions table for the
    given post ids (PG-specific UPDATE ... FROM).

    :param post_ids: iterable of integer post ids; an empty iterable is a
        no-op. Errors are printed and swallowed (best-effort batch update).
    """
    post_ids = list(post_ids)
    if not post_ids:
        # BUGFIX: an empty list used to render 'IN ()' which is invalid SQL.
        return
    try:
        connection = db_util.connection_for_reading()
        cursor = connection.cursor()
        # BUGFIX: str(tuple([x])) renders '(x,)' whose trailing comma is a
        # SQL syntax error; build the list manually. int() also guarantees
        # only numbers are interpolated into the statement.
        post_comma_string = '(%s)' % ', '.join(
            str(int(post_id)) for post_id in post_ids)
        # PG-specific UPDATE FROM command
        cursor.execute('''
            UPDATE {interactions}
            SET platform_id = ps.platform_id
            FROM (SELECT id, platform_id FROM debra_posts
                  WHERE id IN {post_ids}) AS ps
            WHERE post_id IN {post_ids} AND ps.id = post_id
        '''.format(interactions=INTERACTIONS_TABLE,
                   post_ids=post_comma_string))
    # narrowed from a bare except: keep best-effort semantics without
    # catching SystemExit/KeyboardInterrupt
    except Exception:
        traceback.print_exc()
def fetch_prods_for_posts_with_liketk_but_no_products():
    """Queue product fetching for Instagram posts containing liketk.it links
    that have no product shelf mappings despite being import-complete."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select po.id from debra_posts po join debra_platform pl on po.platform_id=pl.id where not exists(select * from debra_productmodelshelfmap pmsm where pmsm.post_id=po.id) and content ilike '%liketk.it%' and po.products_import_completed=true and pl.platform_name='Instagram' """)
    ids_to_fetch = [row[0] for row in cursor]
    _submit_fetch_prods(ids_to_fetch)
def get_influencer_batch():
    '''Return up to 10 influencer ids with show_on_search = false.

    previously prepared influencers as:

    create table tmp_influencer_show_on_search as
        select id, show_on_search from debra_influencer
        where show_on_search is not null;
    create index tmp_influencer_show_on_search_influencer_id
        on tmp_influencer_show_on_search(id);

    HACK: this selects and updates show_on_search = False influencers to
    avoid joins. To handle all values we need to do the same for
    show_on_search = True ones too.
    '''
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute('SELECT id FROM tmp_influencer_show_on_search WHERE show_on_search = false LIMIT 10')
    return [influencer_id for (influencer_id,) in cursor.fetchall()]
def fetch_count_and_objects(sql, mclass, id_alias='pl.id'):
    """A sql should contain '{what}' in a place of columns selection.

    Returns a tuple of:
    - 'count(*)' over the query
    - a list of ``mclass`` model objects for the first RES_LIMIT matching ids
    """
    cursor = db_util.connection_for_reading().cursor()
    # first pass: total row count
    cursor.execute(sql.format(what='count(*)'))
    total = cursor.fetchone()[0]
    # second pass: capped id selection converted to model objects
    cursor.execute('%s limit %s' % (sql.format(what=id_alias), RES_LIMIT))
    found_objects = from_ids(cursor, mclass)
    return total, found_objects
def most_popular_brands(how_many):
    """Return the ``how_many`` Brands objects with the most product shelf
    mappings, most popular first."""
    connection = db_util.connection_for_reading()
    cursor = connection.cursor()
    cursor.execute(
        """select pm.brand_id, count(*) as brand_popularity from debra_productmodelshelfmap pmsm join debra_productmodel pm on pm.id=pmsm.product_model_id group by pm.brand_id order by brand_popularity desc limit %s""",
        [how_many])
    top_ids = [row[0] for row in cursor]
    cursor.close()
    # NOTE: one ORM query per id, preserving the popularity ordering
    return [debra.models.Brands.objects.get(id=brand_id)
            for brand_id in top_ids]
def create_blog_platforms_for_source_google_when_missing():
    """Create a blog platform for every non-blacklisted 'google'-sourced
    influencer that has no Blogspot/Wordpress/Custom platform yet."""
    from debra import helpers
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select inf.id from debra_influencer inf left join debra_platform pl on pl.influencer_id=inf.id and (pl.platform_name in ('Blogspot', 'Wordpress', 'Custom') or pl.platform_name is null) where inf.blacklisted = False and inf.source = 'google' and pl.id is null; """)
    rows = cursor.fetchall()
    log.info('%d infs without a blog platform', len(rows))
    for (influencer_id,) in rows:
        influencer = models.Influencer.objects.get(id=influencer_id)
        helpers.create_blog_platform_for_blog_url(influencer)
def report_all(self, platform_name):
    """ Run this for each type of platform.

    Finds pairs of distinct validated, searchable influencers whose
    ``platform_name`` platforms share the same non-empty url and files a
    suspect-duplicate InfluencerCheck for each side of every pair.
    """
    cause = getattr(models.InfluencerCheck,
                    "CAUSE_SUSPECT_DUPLICATE_SOCIAL_%s" % platform_name)
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select distinct inf1.id, inf2.id, pl1.id from debra_platform pl1, debra_platform pl2, debra_influencer inf1, debra_influencer inf2 where pl1.url = pl2.url and pl1.url <> '' and pl1.id < pl2.id and pl1.url_not_found=false and pl2.url_not_found=false and pl1.platform_name = '{platform_name}' and pl2.platform_name = pl1.platform_name and inf1.id=pl1.influencer_id and inf2.id=pl2.influencer_id and inf1.blacklisted=false and inf1.source is not null and inf1.validated_on like '%%info%%' and inf1.show_on_search=true and inf2.blacklisted=false and inf2.source is not null and inf2.validated_on like '%%info%%' and inf2.show_on_search=true """.format(platform_name=platform_name))
    log.info('Fetching %d duplicate pairs', cursor.rowcount)
    for inf1_id, inf2_id, pl1_id in cursor:
        platform = models.Platform.objects.get(id=pl1_id)
        # report each influencer of the pair, pointing at the other one
        for this_id, other_id in ((inf1_id, inf2_id), (inf2_id, inf1_id)):
            models.InfluencerCheck.report_new(
                models.Influencer.objects.get(id=this_id),
                platform,
                cause,
                [],
                data={'related': [['Influencer', other_id]]},
            )
def check(self): connection = db_util.connection_for_reading() cur = connection.cursor() cur.execute(""" select count(case when error_msg is not null and error_msg <> 'old_version' then 1 else null end) as errors, count(*) as all from debra_platformdataop pdo join debra_platform pl on pl.id=pdo.platform_id join debra_influencer inf on inf.id=pl.influencer_id where pdo.operation='fetch_data' and inf.show_on_search = true and not (pl.url_not_found = true) and pdo.started > current_timestamp - '{hours} hours'::interval""". format(hours=self.hours)) errors, all = cur.fetchone() if all == 0: errors_pct = 0 else: errors_pct = (errors * 100) / all print 'Errors in the last %s hours: %.2f%%' % (self.hours, errors_pct)
def to_recalculate_is_active():
    """Return ids of inactive influencers whose most recent blog post
    (Wordpress/Blogspot/Custom) is newer than 90 days — candidates to have
    is_active recalculated."""
    sql = '''
    SELECT DISTINCT i.id FROM debra_influencer i INNER JOIN (SELECT inf.id, max(inserted_datetime) AS last_post FROM debra_influencer inf INNER JOIN debra_platform p ON p.influencer_id = inf.id INNER JOIN debra_posts ps ON ps.platform_id = p.id WHERE p.platform_name IN ('Wordpress', 'Blogspot', 'Custom') GROUP BY inf.id) ips ON i.id = ips.id WHERE ips.last_post > now() - '90 days'::interval AND i.is_active = 'f'::bool AND i.source IS NOT NULL AND i.blog_url IS NOT NULL
    '''
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(sql)
    return [influencer_id for (influencer_id,) in cursor.fetchall()]
def fix_duplicates_by_social_platform():
    """Find pairs of distinct validated, active, searchable influencers whose
    social platforms (FB/Twitter/Instagram/Pinterest) share the same url and
    hand each pair to ``_handle_dup_pair`` for deduplication."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(""" select distinct inf1.id, inf2.id from debra_platform pl1, debra_platform pl2, debra_influencer inf1, debra_influencer inf2 where pl1.url = pl2.url and pl1.url <> '' and pl1.id < pl2.id and pl1.url_not_found=false and pl2.url_not_found=false and pl1.platform_name in ('Facebook', 'Twitter', 'Instagram', 'Pinterest') and pl2.platform_name = pl1.platform_name and inf1.id=pl1.influencer_id and inf2.id=pl2.influencer_id and inf1.blacklisted=false and inf1.source is not null and inf1.validated_on like '%%info%%' and inf1.show_on_search=true and {inf1_active} and inf2.blacklisted=false and inf2.source is not null and inf2.validated_on like '%%info%%' and inf2.show_on_search=true and {inf2_active} """.format(inf1_active=models.InfluencerQuerySet.active_sql('inf1'),
               inf2_active=models.InfluencerQuerySet.active_sql('inf2')))
    log.info('Fetching %d duplicate pairs', cursor.rowcount)
    for first_id, second_id in cursor:
        _handle_dup_pair(first_id, second_id)
def insert_warnings(q, invariant_instance, mclass, id_alias='pl.id'):
    """File InfluencerCheck warnings for every id produced by ``q``.

    ``q`` may be raw sql (with a '{what}' placeholder for the id column), a
    QuerySet, or a list of model instances. ``mclass`` selects whether the
    ids denote Platforms or Influencers. A warning is only reported if an
    identical STATUS_NEW check does not already exist.
    """
    # normalize q into a generator of ids
    if isinstance(q, (str, unicode)):
        # raw sql
        cursor = db_util.connection_for_reading().cursor()
        cursor.execute(q.format(what=id_alias))
        r_ids = (row[0] for row in cursor)
    elif isinstance(q, query.QuerySet):
        r_ids = (m.id for m in q.iterator())
    elif isinstance(q, list):
        r_ids = (m.id for m in q)
    else:
        assert False, 'Unknown type of q: %r' % type(q)

    cause = getattr(invariant_instance, 'cause', None)
    if not cause:
        log.error('cause not specified in %r', invariant_instance)
        return
    fields = getattr(invariant_instance, 'fields', None) or []

    for r_id in r_ids:
        # resolve the (influencer, platform) pair the warning attaches to
        if mclass == models.Platform:
            platform = models.Platform.objects.get(id=r_id)
            influencer = platform.influencer
        elif mclass == models.Influencer:
            influencer = models.Influencer.objects.get(id=r_id)
            platform = None
        else:
            assert False, 'Unknown model class %r' % mclass
        already_reported = models.InfluencerCheck.objects.filter(
            influencer=influencer,
            platform=platform,
            cause=cause,
            fields=fields,
            status=models.InfluencerCheck.STATUS_NEW).exists()
        if not already_reported:
            models.InfluencerCheck.report(influencer, platform, cause, fields)
def get_all_platforms():
    """Return the id of every row in debra_platform."""
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute("select p.id from debra_platform as p")
    return [platform_id for (platform_id,) in cursor.fetchall()]
def force_db_indexes_usage():
    """Disable sequential scans on the read connection so the Postgres
    planner is pushed toward index usage (``enable_seqscan = false``)."""
    from debra import db_util
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute("set enable_seqscan = false")