Example #1
0
def find_and_reimport_linking_posts(slot=0):
    """Reset product import for posts that link to sidebar-sponsorship posts.

    Collects the urls of all posts that have a ``debra_sponsorshipinfo`` row
    with ``sidebar=true`` and then scans the posts of one slice of the
    ``show_on_search`` influencers.  Any post whose content contains one of
    those urls gets its ``brandinpost_set`` deleted and
    ``products_import_completed`` set to ``False``.

    :param slot: zero-based index of the slice of influencers to process;
        the influencers (ordered by id) are split into ``num_workers`` (10)
        contiguous slices.
    """
    connection = db_util.connection_for_reading()
    cur = connection.cursor()
    cur.execute("""
    select po.url
    from debra_sponsorshipinfo si
    join debra_posts po on si.post_id=po.id
    where si.sidebar=true;
    """)
    bad_post_urls = {r[0] for r in cur}
    log.info('Got %d bad urls', len(bad_post_urls))
    infs = debra.models.Influencer.objects.filter(show_on_search=True).order_by('id')
    count = infs.count()
    num_workers = 10
    # Ceiling division: with floor division the trailing
    # ``count % num_workers`` influencers would not belong to any slot.
    slice_val = (count + num_workers - 1) // num_workers
    for inf in infs[slot * slice_val:(slot + 1) * slice_val]:
        log.info('Processing influencer %r', inf)
        all_posts = inf.posts_set.all()
        for post in all_posts.iterator():
            try:
                content = platformutils.iterate_resolve_shortened_urls(post.content)
                all_urls = contentfiltering.find_all_urls(content)
                log.info('Urls in post %r: %r', post, all_urls)
                already_reset = False
                for url in all_urls:
                    url = utils.remove_query_params(url)
                    if url in bad_post_urls:
                        log.warn('Bad url: %r', url)
                        # Every bad url is still logged, but the post only
                        # needs to be reset once.
                        if not already_reset:
                            post.brandinpost_set.all().delete()
                            post.products_import_completed = False
                            post.save()
                            already_reset = True
            except Exception:
                # Best effort: one broken post must not stop the whole run,
                # but KeyboardInterrupt/SystemExit are allowed to propagate.
                log.exception('While processing %r', post)
def get_batch(batch_size=10000):
    """Return distinct ``post_id`` values of interaction rows lacking a platform.

    At most ``batch_size`` rows with ``platform_id IS NULL`` are read from
    the interactions table; duplicate post ids are collapsed.
    """
    cur = db_util.connection_for_reading().cursor()
    sql = 'SELECT post_id FROM {interactions} WHERE platform_id IS NULL LIMIT %s'.format(
        interactions=INTERACTIONS_TABLE)
    cur.execute(sql, [batch_size])
    fetched_ids = []
    for record in cur.fetchall():
        fetched_ids.append(record[0])
    unique_ids = set(fetched_ids)
    return list(unique_ids)
def get_blog_platforms():
    """Return ids of Wordpress/Blogspot/Custom platforms having at least one post."""
    sql = """
                   select distinct p.id from debra_platform p inner join debra_posts ps
                    on ps.platform_id = p.id
                   where p.platform_name in ('Wordpress', 'Blogspot', 'Custom')
                   """
    cur = db_util.connection_for_reading().cursor()
    cur.execute(sql)
    platform_ids = []
    for record in cur.fetchall():
        platform_ids.append(record[0])
    return platform_ids
Example #4
0
def blacklist_platforms_with_fetch_errors():
    """Mark every platform returned by SQL_PLATFORM_IDS_WITH_FETCH_ERRORS as not found.

    Each platform is loaded, flagged with ``url_not_found = True`` and saved
    inside an ``OpRecorder`` context for the
    'blacklist_platforms_with_fetch_errors' operation.
    """
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(SQL_PLATFORM_IDS_WITH_FETCH_ERRORS)
    log.info('%d plats to blacklist', cursor.rowcount)
    for record in cursor:
        # Each row is expected to carry exactly one column: the platform id.
        (platform_id,) = record
        platform = Platform.objects.get(id=platform_id)
        recorder = platformutils.OpRecorder(
            operation='blacklist_platforms_with_fetch_errors',
            platform=platform)
        with recorder:
            log.info('Blacklisting platform %r', platform)
            platform.url_not_found = True
            platform.save()
Example #5
0
def fetch_prods_for_posts_with_sponsorships_but_no_products():
    """Submit product fetching for sponsored posts that have no products.

    Selects posts that have a sponsorship info row, no
    ``debra_productmodelshelfmap`` row and ``products_import_completed``
    already true, then hands their ids to ``_submit_fetch_prods``.
    """
    sql = """
    select po.id
    from debra_posts po
    where not exists(select * from debra_productmodelshelfmap pmsm where pmsm.post_id=po.id)
    and exists(select * from debra_sponsorshipinfo si where si.post_id=po.id)
    and po.products_import_completed = true
    """
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(sql)
    selected_ids = []
    for record in cursor:
        selected_ids.append(record[0])
    _submit_fetch_prods(selected_ids)
def update_batch(post_ids):
    """Copy ``platform_id`` from ``debra_posts`` into the interactions table.

    For every interaction row whose ``post_id`` is in ``post_ids`` the
    ``platform_id`` column is set to the platform id of the matching post.

    :param post_ids: iterable of post ids; an empty iterable is a no-op.

    Errors are printed with ``traceback.print_exc()`` and not re-raised
    (best-effort behaviour kept from the original implementation).
    """
    try:
        ids = tuple(post_ids)
        if not ids:
            # ``IN ()`` is not valid SQL and there is nothing to update anyway.
            return
        connection = db_util.connection_for_reading()
        cursor = connection.cursor()
        # PG-specific UPDATE FROM command.
        # The ids are passed as a query parameter (psycopg2 adapts a tuple to
        # an ``IN`` list) instead of interpolating ``str(tuple(...))``, which
        # produced invalid SQL for a single id (``(5,)``) and for Python 2
        # longs (``5L``).  Only the table name is formatted into the text.
        cursor.execute('''
    UPDATE {interactions}
    SET platform_id = ps.platform_id
    FROM (SELECT id, platform_id FROM debra_posts WHERE id IN %s) AS ps
    WHERE post_id IN %s AND ps.id = post_id
                    '''.format(interactions=INTERACTIONS_TABLE), [ids, ids])
    except Exception:
        traceback.print_exc()
Example #7
0
def fetch_prods_for_posts_with_liketk_but_no_products():
    """Submit product fetching for Instagram liketk.it posts without products.

    Selects Instagram posts whose content matches ``%liketk.it%``, that have
    no ``debra_productmodelshelfmap`` row and have
    ``products_import_completed`` true, then passes their ids to
    ``_submit_fetch_prods``.
    """
    sql = """
    select po.id
    from debra_posts po
    join debra_platform pl on po.platform_id=pl.id
    where not exists(select * from debra_productmodelshelfmap pmsm where pmsm.post_id=po.id)
    and content ilike '%liketk.it%'
    and po.products_import_completed=true
    and pl.platform_name='Instagram'
    """
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(sql)
    selected_ids = []
    for record in cursor:
        selected_ids.append(record[0])
    _submit_fetch_prods(selected_ids)
Example #8
0
def get_influencer_batch():
    '''
    Return up to 10 influencer ids from the helper table
    ``tmp_influencer_show_on_search`` whose ``show_on_search`` is false.

    The helper table is expected to have been prepared beforehand with:

create table tmp_influencer_show_on_search as select id, show_on_search from debra_influencer where show_on_search is not null;
create index tmp_influencer_show_on_search_influencer_id on tmp_influencer_show_on_search(id);

    HACK: only show_on_search = False influencers are selected (and updated)
    here, to avoid joins. Handling all values requires doing the same for
    the show_on_search = True ones as well.
    '''
    cur = db_util.connection_for_reading().cursor()
    cur.execute('SELECT id FROM tmp_influencer_show_on_search WHERE show_on_search = false LIMIT 10')
    influencer_ids = []
    for record in cur.fetchall():
        influencer_ids.append(record[0])
    return influencer_ids
Example #9
0
def fetch_count_and_objects(sql, mclass, id_alias='pl.id'):
    """Run ``sql`` twice: once to count rows and once to load model objects.

    ``sql`` must contain a ``{what}`` placeholder where the selected columns
    go.  It is first executed with ``count(*)`` and then with ``id_alias``
    plus a ``limit RES_LIMIT`` suffix; the second cursor is handed to
    ``from_ids`` together with ``mclass``.

    Returns a ``(count, objects)`` tuple.
    """
    cursor = db_util.connection_for_reading().cursor()

    # First pass: total number of matching rows.
    count_sql = sql.format(what='count(*)')
    cursor.execute(count_sql)
    count_row = cursor.fetchone()
    total = count_row[0]

    # Second pass: the ids, capped at RES_LIMIT.
    ids_sql = '%s limit %s' % (sql.format(what=id_alias), RES_LIMIT)
    cursor.execute(ids_sql)
    loaded = from_ids(cursor, mclass)

    return total, loaded
Example #10
0
def most_popular_brands(how_many):
    """Return the ``how_many`` brands with the most shelved products.

    Popularity is the number of ``debra_productmodelshelfmap`` rows whose
    product model belongs to the brand.  The result is a list of
    ``debra.models.Brands`` objects ordered from most to least popular.

    Brand ids that cannot be loaded (for example the group of product models
    with a NULL ``brand_id``) are skipped instead of raising.
    """
    connection = db_util.connection_for_reading()
    cur = connection.cursor()
    try:
        cur.execute(
            """select pm.brand_id, count(*) as brand_popularity
        from debra_productmodelshelfmap pmsm
        join debra_productmodel pm on pm.id=pmsm.product_model_id
        group by pm.brand_id
        order by brand_popularity desc
        limit %s""", [how_many])
        brands_ids = [row[0] for row in cur]
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    # One query for all brands instead of one ``get`` per id.
    brands_by_id = debra.models.Brands.objects.in_bulk(
        [b_id for b_id in brands_ids if b_id is not None])
    # ``in_bulk`` returns a dict, so rebuild the popularity order explicitly.
    return [brands_by_id[b_id] for b_id in brands_ids if b_id in brands_by_id]
Example #11
0
def create_blog_platforms_for_source_google_when_missing():
    """Create blog platforms for google-sourced influencers that lack one.

    Finds non-blacklisted influencers with ``source = 'google'`` that have no
    Blogspot/Wordpress/Custom platform row and calls
    ``helpers.create_blog_platform_for_blog_url`` for each of them.
    """
    from debra import helpers

    sql = """
    select inf.id
    from debra_influencer inf
    left join debra_platform pl on pl.influencer_id=inf.id and (pl.platform_name in ('Blogspot', 'Wordpress', 'Custom') or pl.platform_name is null)
    where inf.blacklisted = False
    and inf.source = 'google'
    and pl.id is null;
    """
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(sql)
    records = cursor.fetchall()
    log.info('%d infs without a blog platform', len(records))
    for record in records:
        # Rows carry a single column: the influencer id.
        (influencer_id,) = record
        influencer = models.Influencer.objects.get(id=influencer_id)
        helpers.create_blog_platform_for_blog_url(influencer)
Example #12
0
 def report_all(self, platform_name):
     """
     Report suspected duplicate influencers sharing a social platform url.

     Run this once for each type of platform.  For every pair of visible,
     non-blacklisted influencers owning a ``platform_name`` platform with the
     same non-empty url, two ``InfluencerCheck`` reports are created (one per
     influencer), each pointing at the other influencer in ``data['related']``.
     The cause is ``InfluencerCheck.CAUSE_SUSPECT_DUPLICATE_SOCIAL_<platform_name>``.
     """
     cause = getattr(models.InfluencerCheck,
                     "CAUSE_SUSPECT_DUPLICATE_SOCIAL_%s" % platform_name)
     cursor = db_util.connection_for_reading().cursor()
     sql_template = """
     select distinct inf1.id, inf2.id, pl1.id
     from debra_platform pl1, debra_platform pl2, debra_influencer inf1, debra_influencer inf2
     where pl1.url = pl2.url
     and pl1.url <> ''
     and pl1.id < pl2.id
     and pl1.url_not_found=false
     and pl2.url_not_found=false
     and pl1.platform_name = '{platform_name}'
     and pl2.platform_name = pl1.platform_name
     and inf1.id=pl1.influencer_id
     and inf2.id=pl2.influencer_id
     and inf1.blacklisted=false and inf1.source is not null and inf1.validated_on like '%%info%%'
                 and inf1.show_on_search=true
     and inf2.blacklisted=false and inf2.source is not null and inf2.validated_on like '%%info%%'
                 and inf2.show_on_search=true
     """
     cursor.execute(sql_template.format(platform_name=platform_name))
     log.info('Fetching %d duplicate pairs', cursor.rowcount)
     for first_id, second_id, platform_id in cursor:
         # Report the pair in both directions.
         # NOTE(review): both reports use the first influencer's platform
         # (pl1), also for the second influencer -- confirm this is intended.
         for subject_id, related_id in ((first_id, second_id), (second_id, first_id)):
             models.InfluencerCheck.report_new(
                 models.Influencer.objects.get(id=subject_id),
                 models.Platform.objects.get(id=platform_id),
                 cause,
                 [],
                 data={'related': [['Influencer', related_id]]},
             )
Example #13
0
 def check(self):
     """Print the percentage of failed 'fetch_data' operations.

     Looks at ``debra_platformdataop`` rows started within the last
     ``self.hours`` hours for platforms that are not flagged
     ``url_not_found`` and belong to ``show_on_search`` influencers.  A row
     counts as an error when ``error_msg`` is set and is not 'old_version'.
     """
     connection = db_util.connection_for_reading()
     cur = connection.cursor()
     cur.execute("""
     select
     count(case when error_msg is not null and error_msg <> 'old_version' then 1 else null end) as errors,
     count(*) as all
     from debra_platformdataop pdo
     join debra_platform pl on pl.id=pdo.platform_id
     join debra_influencer inf on inf.id=pl.influencer_id
     where pdo.operation='fetch_data'
     and inf.show_on_search = true
     and not (pl.url_not_found = true)
     and pdo.started > current_timestamp - '{hours} hours'::interval""".
                 format(hours=self.hours))
     # ``total`` rather than ``all`` so the builtin is not shadowed.
     errors, total = cur.fetchone()
     if total == 0:
         errors_pct = 0
     else:
         # Float arithmetic: integer division would truncate the percentage
         # and make the two decimals printed below always ``.00``.
         errors_pct = (errors * 100.0) / total
     print('Errors in the last %s hours: %.2f%%' % (self.hours, errors_pct))
Example #14
0
def to_recalculate_is_active():
    """Return ids of inactive influencers that posted on a blog recently.

    An influencer is selected when ``is_active`` is false, ``source`` and
    ``blog_url`` are set, and the newest ``inserted_datetime`` among posts on
    its Wordpress/Blogspot/Custom platforms is within the last 90 days.
    """
    sql = '''
SELECT DISTINCT i.id
FROM debra_influencer i
INNER JOIN
  (SELECT inf.id,
          max(inserted_datetime) AS last_post
   FROM debra_influencer inf
   INNER JOIN debra_platform p ON p.influencer_id = inf.id
   INNER JOIN debra_posts ps ON ps.platform_id = p.id
   WHERE p.platform_name IN ('Wordpress',
                             'Blogspot',
                             'Custom')
   GROUP BY inf.id) ips ON i.id = ips.id
WHERE ips.last_post > now() - '90 days'::interval AND
    i.is_active = 'f'::bool AND
    i.source IS NOT NULL AND
    i.blog_url IS NOT NULL
    '''
    cursor = db_util.connection_for_reading().cursor()
    cursor.execute(sql)
    influencer_ids = []
    for record in cursor.fetchall():
        influencer_ids.append(record[0])
    return influencer_ids
Example #15
0
def fix_duplicates_by_social_platform():
    """Handle influencer pairs that share the same social platform url.

    Selects pairs of active, visible, non-blacklisted influencers owning a
    Facebook/Twitter/Instagram/Pinterest platform with an identical non-empty
    url and passes each pair of ids to ``_handle_dup_pair``.
    """
    cursor = db_util.connection_for_reading().cursor()
    sql_template = """
select distinct inf1.id, inf2.id
from debra_platform pl1, debra_platform pl2, debra_influencer inf1, debra_influencer inf2
where pl1.url = pl2.url
and pl1.url <> ''
and pl1.id < pl2.id
and pl1.url_not_found=false
and pl2.url_not_found=false
and pl1.platform_name in ('Facebook', 'Twitter', 'Instagram', 'Pinterest')
and pl2.platform_name = pl1.platform_name
and inf1.id=pl1.influencer_id
and inf2.id=pl2.influencer_id
and inf1.blacklisted=false and inf1.source is not null and inf1.validated_on like '%%info%%'
                and inf1.show_on_search=true and  {inf1_active}
and inf2.blacklisted=false and inf2.source is not null and inf2.validated_on like '%%info%%' and inf2.show_on_search=true
                and {inf2_active}
    """
    # The "active" conditions are SQL fragments generated per table alias.
    first_active = models.InfluencerQuerySet.active_sql('inf1')
    second_active = models.InfluencerQuerySet.active_sql('inf2')
    cursor.execute(sql_template.format(inf1_active=first_active,
                                       inf2_active=second_active))
    log.info('Fetching %d duplicate pairs', cursor.rowcount)
    for pair in cursor:
        first_id, second_id = pair
        _handle_dup_pair(first_id, second_id)
Example #16
0
def insert_warnings(q, invariant_instance, mclass, id_alias='pl.id'):
    """Report an ``InfluencerCheck`` for every object selected by ``q``.

    :param q: what to report on; one of
        - a SQL string containing a ``{what}`` placeholder, which is replaced
          by ``id_alias`` so the query yields ids,
        - a ``QuerySet`` (ids are read from its objects),
        - a list of model objects.
    :param invariant_instance: object providing ``cause`` (required) and
        optionally ``fields`` attributes used for the report.
    :param mclass: ``models.Platform`` or ``models.Influencer``; tells how the
        ids are to be interpreted.
    :param id_alias: column expression substituted for ``{what}`` in SQL.

    :raises AssertionError: if ``q`` or ``mclass`` is of an unsupported kind.

    Nothing is reported (an error is logged) when ``invariant_instance`` has
    no ``cause``.  A report is skipped when an identical check with status
    ``STATUS_NEW`` already exists.
    """
    if isinstance(q, (str, unicode)):
        # sql
        connection = db_util.connection_for_reading()
        cur = connection.cursor()
        cur.execute(q.format(what=id_alias))
        r_ids = (row[0] for row in cur)
    elif isinstance(q, query.QuerySet):
        r_ids = (m.id for m in q.iterator())
    elif isinstance(q, list):
        r_ids = (m.id for m in q)
    else:
        # Raised explicitly: an ``assert`` statement is stripped under ``-O``,
        # which would leave ``r_ids`` unbound.
        raise AssertionError('Unknown type of q: %r' % type(q))
    cause = getattr(invariant_instance, 'cause', None)
    if not cause:
        log.error('cause not specified in %r', invariant_instance)
        return
    fields = getattr(invariant_instance, 'fields', None) or []
    for r_id in r_ids:
        if mclass == models.Platform:
            platform = models.Platform.objects.get(id=r_id)
            influencer = platform.influencer
        elif mclass == models.Influencer:
            influencer = models.Influencer.objects.get(id=r_id)
            platform = None
        else:
            # Explicit raise for the same reason as above; with the check
            # stripped, the code below would use unbound names.
            raise AssertionError('Unknown model class %r' % mclass)
        already_reported = models.InfluencerCheck.objects.filter(
            influencer=influencer,
            platform=platform,
            cause=cause,
            fields=fields,
            status=models.InfluencerCheck.STATUS_NEW).exists()
        if not already_reported:
            models.InfluencerCheck.report(influencer, platform, cause, fields)
def get_all_platforms():
    """Return the ids of all rows in ``debra_platform``."""
    cur = db_util.connection_for_reading().cursor()
    cur.execute("select p.id from debra_platform as p")
    platform_ids = []
    for record in cur.fetchall():
        platform_ids.append(record[0])
    return platform_ids
Example #18
0
def force_db_indexes_usage():
    """Execute ``set enable_seqscan = false`` on the reading connection."""
    from debra import db_util
    reading_cursor = db_util.connection_for_reading().cursor()
    reading_cursor.execute("set enable_seqscan = false")