Ejemplo n.º 1
0
def update_first_occured_agregates():

    missing_crawls = query_to_tuples(
        """select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );"""
    )

    for row in missing_crawls:

        crawl_id = row[0]
        logging.info(
            "inserting missing crawl into main_hitgroupfirstoccurences: %s" %
            crawl_id)

        execute_sql("""INSERT INTO
                main_hitgroupfirstoccurences
                    select
                        p.reward,
                        p.id,
                        q.crawl_id,
                        p.requester_name,
                        q.id,
                        p.occurrence_date,
                        p.requester_id,
                        p.group_id,
                        q.hits_available,
                        nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
                    from main_hitgroupcontent p join main_hitgroupstatus q
                        on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id )
                        where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
Ejemplo n.º 2
0
def update_first_occured_agregates():

    missing_crawls = query_to_tuples("""select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );""")

    for row in missing_crawls:

        crawl_id = row[0]
        logging.info("inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id)

        execute_sql("""INSERT INTO
                main_hitgroupfirstoccurences
                    select
                        p.reward,
                        p.id,
                        q.crawl_id,
                        p.requester_name,
                        q.id,
                        p.occurrence_date,
                        p.requester_id,
                        p.group_id,
                        q.hits_available,
                        nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
                    from main_hitgroupcontent p join main_hitgroupstatus q
                        on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id )
                        where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
Ejemplo n.º 3
0
def topreq_data(days):
    from tenclouds.sql import query_to_tuples, execute_sql
    start_time = time.time()
    firstcrawl = execute_sql("""
        SELECT crawl_id
        FROM hits_mv
        WHERE
            start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""", datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    return list(query_to_tuples("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(*) as "projects",
            sum(mv.hits_available) as "hits",
            sum(mv.hits_available*h.reward) as "reward",
            max(h.occurrence_date) as "last_posted"
        FROM
                main_hitgroupcontent h
                LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
                LEFT JOIN (
                    SELECT group_id, crawl_id, hits_available from
                    hits_mv where crawl_id> %s
                ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id)
            WHERE
                h.first_crawl_id > %s
                AND coalesce(p.is_public, true) = true
            group by h.requester_id, h.requester_name
            order by sum(mv.hits_available*h.reward) desc
            limit 1000;""" % (firstcrawl, firstcrawl)))
Ejemplo n.º 4
0
def topreq_data(days):
    from tenclouds.sql import query_to_tuples, execute_sql
    start_time = time.time()
    firstcrawl = execute_sql(
        """
        SELECT crawl_id
        FROM hits_mv
        WHERE
            start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""",
        datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    return list(
        query_to_tuples("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(*) as "projects",
            sum(mv.hits_available) as "hits",
            sum(mv.hits_available*h.reward) as "reward",
            max(h.occurrence_date) as "last_posted"
        FROM
                main_hitgroupcontent h
                LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
                LEFT JOIN (
                    SELECT group_id, crawl_id, hits_available from
                    hits_mv where crawl_id> %s
                ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id)
            WHERE
                h.first_crawl_id > %s
                AND coalesce(p.is_public, true) = true
            group by h.requester_id, h.requester_name
            order by sum(mv.hits_available*h.reward) desc
            limit 1000;""" % (firstcrawl, firstcrawl)))
Ejemplo n.º 5
0
def top_requesters(request):
    def row_formatter(input):
        "Yield formatted rows"
        for cc in input:
            row = []
            url = reverse('requester_details', kwargs={'requester_id': cc[0]})
            row.append('<a href="%s">%s</a>' % (url, cc[1]))
            row.append(
                '<a href="https://www.mturk.com/mturk/searchbar?requesterId=%s" target="_mturk">%s</a> (<a href="http://feed.crowdsauced.com/r/req/%s">RSS</a>)'
                % (cc[0], cc[0], cc[0]))
            row.extend(cc[2:6])
            url = reverse('admin-toggle-requester-status', args=(cc[0], ))
            row.append('<a href="%s">%s</a>' %
                       (url, cc[6] and 'public' or 'private'))
            yield row

    date_30_days_before = datetime.date.today() - datetime.timedelta(days=30)
    data = row_formatter(
        query_to_tuples(
            '''
            SELECT
                h.requester_id,
                h.requester_name,
                count(*) as "projects",
                sum(h.hits_available) as "hits",
                sum(h.hits_available*reward) as "reward",
                max(h.occurrence_date) as "last_posted",
                coalesce(p.is_public, true) as is_public
            FROM
                main_hitgroupfirstoccurences h
                    LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
            WHERE
                h.occurrence_date > %s
            GROUP BY
                h.requester_id, h.requester_name, p.is_public
            ORDER BY
                sum(h.hits_available*reward)
                DESC
            LIMIT 1000;
            ''', date_30_days_before))

    columns = (
        ('string', 'Requester ID'),
        ('string', 'Requester'),
        ('number', '#Task'),
        ('number', '#HITs'),
        ('number', 'Rewards'),
        ('datetime', 'Last Posted On'),
        ('string', 'Status'),
    )
    ctx = {
        'data': data,
        'columns': columns,
        'title': 'Top-1000 Recent Requesters',
    }
    return direct_to_template(request, 'main/graphs/table.html', ctx)
Ejemplo n.º 6
0
def top_requesters(request):

    def row_formatter(input):
        "Yield formatted rows"
        for cc in input:
            row = []
            url = reverse('requester_details', kwargs={'requester_id':cc[0]})
            row.append('<a href="%s">%s</a>' % (url, cc[1]))
            row.append('<a href="https://www.mturk.com/mturk/searchbar?requesterId=%s" target="_mturk">%s</a> (<a href="http://feed.crowdsauced.com/r/req/%s">RSS</a>)'
                       % (cc[0],cc[0],cc[0]) )
            row.extend(cc[2:6])
            url = reverse('admin-toggle-requester-status', args=(cc[0], ))
            row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private'))
            yield row

    date_30_days_before = datetime.date.today() - datetime.timedelta(days=30)
    data = row_formatter(query_to_tuples('''
            SELECT
                h.requester_id,
                h.requester_name,
                count(*) as "projects",
                sum(h.hits_available) as "hits",
                sum(h.hits_available*reward) as "reward",
                max(h.occurrence_date) as "last_posted",
                coalesce(p.is_public, true) as is_public
            FROM
                main_hitgroupfirstoccurences h
                    LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
            WHERE
                h.occurrence_date > %s
            GROUP BY
                h.requester_id, h.requester_name, p.is_public
            ORDER BY
                sum(h.hits_available*reward)
                DESC
            LIMIT 1000;
            ''', date_30_days_before))

    columns = (
        ('string','Requester ID'),
        ('string','Requester'),
        ('number','#Task'),
        ('number','#HITs'),
        ('number','Rewards'),
        ('datetime', 'Last Posted On'),
        ('string', 'Status'),
    )
    ctx = {
        'data': data,
        'columns': columns,
        'title': 'Top-1000 Recent Requesters',
    }
    return direct_to_template(request, 'main/graphs/table.html', ctx)
Ejemplo n.º 7
0
def requester_details(request, requester_id):
    def row_formatter(input):
        for cc in input:
            row = []
            url = reverse('hit_group_details', kwargs={'hit_group_id': cc[5]})
            row.append('<a href="%s">%s</a>' % (url, cc[0]))
            row.extend(cc[1:5])
            url = reverse('admin-toggle-hitgroup-status', args=(cc[5], ))
            row.append('<a href="%s">%s</a>' %
                       (url, cc[6] and 'public' or 'private'))
            yield row

    requester_name = HitGroupContent.objects.filter(requester_id=requester_id)
    requester_name = requester_name.values_list('requester_name',
                                                flat=True).distinct()

    if requester_name:
        requester_name = requester_name[0]
    else:
        requester_name = requester_id

    date_from = datetime.date.today() - datetime.timedelta(days=30)
    data = query_to_tuples(
        """
        SELECT
            title, hits_available, p.reward, p.occurrence_date,
            (SELECT end_time FROM main_crawl WHERE id = (SELECT max(crawl_id) FROM main_hitgroupstatus WHERE group_id = q.group_id AND hit_group_content_id = p.group_content_id)) - p.occurrence_date, p.group_id, q.is_public
        FROM
            main_hitgroupfirstoccurences p
                JOIN main_hitgroupcontent q ON (p.group_content_id = q.id AND p.requester_id = q.requester_id)
        WHERE
            p.requester_id = %s
            AND p.occurrence_date > %s
            AND q.occurrence_date > %s
        """, requester_id, date_from, date_from)

    columns = (
        ('string', 'HIT Title'),
        ('number', '#HITs'),
        ('number', 'Reward'),
        ('datetime', 'Posted'),
        ('number', 'Duration (Days)'),
        ('string', 'Status'),
    )

    ctx = {
        'data': text_row_formater(row_formatter(data)),
        'columns': tuple(columns),
        'title': 'Last 100 Tasks posted by %s' % requester_name,
        'user': request.user,
    }
    return direct_to_template(request, 'main/requester_details.html', ctx)
Ejemplo n.º 8
0
def requester_details(request, requester_id):
    def row_formatter(input):
        for cc in input:
            row = []
            url = reverse('hit_group_details',kwargs={'hit_group_id':cc[5]})
            row.append('<a href="%s">%s</a>' % (url, cc[0]))
            row.extend(cc[1:5])
            url = reverse('admin-toggle-hitgroup-status', args=(cc[5],))
            row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private'))
            yield row

    requester_name = HitGroupContent.objects.filter(requester_id=requester_id)
    requester_name = requester_name.values_list('requester_name',flat=True).distinct()

    if requester_name:
        requester_name = requester_name[0]
    else:
        requester_name = requester_id

    date_from = datetime.date.today() - datetime.timedelta(days=30)
    data = query_to_tuples("""
        SELECT
            title, hits_available, p.reward, p.occurrence_date,
            (SELECT end_time FROM main_crawl WHERE id = (SELECT max(crawl_id) FROM main_hitgroupstatus WHERE group_id = q.group_id AND hit_group_content_id = p.group_content_id)) - p.occurrence_date, p.group_id, q.is_public
        FROM
            main_hitgroupfirstoccurences p
                JOIN main_hitgroupcontent q ON (p.group_content_id = q.id AND p.requester_id = q.requester_id)
        WHERE
            p.requester_id = %s
            AND p.occurrence_date > %s
            AND q.occurrence_date > %s
        """, requester_id, date_from, date_from)

    columns = (
        ('string', 'HIT Title'),
        ('number', '#HITs'),
        ('number', 'Reward'),
        ('datetime', 'Posted'),
        ('number', 'Duration (Days)'),
        ('string', 'Status'),
    )

    ctx = {
        'data': text_row_formater(row_formatter(data)),
        'columns': tuple(columns),
        'title': 'Last 100 Tasks posted by %s' % requester_name,
        'user': request.user,
    }
    return direct_to_template(request, 'main/requester_details.html', ctx)
Ejemplo n.º 9
0
    def _requester_details(request, requester_id):
        def row_formatter(input):

            for cc in input:
                row = []
                row.append('<a href="%s">%s</a>' %
                           (reverse('hit_group_details',
                                    kwargs={'hit_group_id': cc[3]}), cc[0]))
                row.extend(cc[1:3])
                yield row

        requster_name = HitGroupContent.objects.filter(
            requester_id=requester_id).values_list('requester_name',
                                                   flat=True).distinct()

        if requster_name: requster_name = requster_name[0]
        else: requster_name = requester_id

        date_from = (datetime.date.today() -
                     datetime.timedelta(days=30)).isoformat()

        data = query_to_tuples("""
    select
        title,
        p.reward,
        p.occurrence_date,
        p.group_id
    from main_hitgroupcontent p
        LEFT JOIN main_requesterprofile r ON p.requester_id = r.requester_id
    where
        p.requester_id = '%s'
        AND coalesce(r.is_public, true) = true
        and
        p.occurrence_date > TIMESTAMP '%s';
        """ % (requester_id, date_from))

        columns = [
            ('string', 'HIT Title'),
            ('number', 'Reward'),
            ('datetime', 'Posted'),
        ]
        ctx = {
            'data': text_row_formater(row_formatter(data)),
            'columns': tuple(columns),
            'title':
            'Tasks posted during last 30 days by %s' % (requster_name),
            'user': request.user,
        }
        return direct_to_template(request, 'main/requester_details.html', ctx)
Ejemplo n.º 10
0
def update_cid(cid):

    st = time.time()
    count = 0
    for i, g in enumerate(query_to_tuples("select distinct group_id from hits_mv where crawl_id = %s", cid)):
        g = g[0]
        log.info("processing %s, %s %s", i, cid,  g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")

    log.info("updated crawl in %s", time.time() - st)

    return count
Ejemplo n.º 11
0
def update_cid(cid):

    st = time.time()
    count = 0
    for i, g in enumerate(
            query_to_tuples(
                "select distinct group_id from hits_mv where crawl_id = %s",
                cid)):
        g = g[0]
        log.info("processing %s, %s %s", i, cid, g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")

    log.info("updated crawl in %s", time.time() - st)

    return count
Ejemplo n.º 12
0
    def _requester_details(request, requester_id):
        def row_formatter(input):

            for cc in input:
                row = []
                row.append('<a href="%s">%s</a>' % (reverse('hit_group_details',kwargs={'hit_group_id':cc[3]}),cc[0]))
                row.extend(cc[1:3])
                yield row

        requster_name = HitGroupContent.objects.filter(requester_id = requester_id).values_list('requester_name',flat=True).distinct()

        if requster_name: requster_name = requster_name[0]
        else: requster_name = requester_id

        date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()

        data = query_to_tuples("""
    select
        title,
        p.reward,
        p.occurrence_date,
        p.group_id
    from main_hitgroupcontent p
        LEFT JOIN main_requesterprofile r ON p.requester_id = r.requester_id
    where
        p.requester_id = '%s'
        AND coalesce(r.is_public, true) = true
        and
        p.occurrence_date > TIMESTAMP '%s';
        """ % (requester_id, date_from))

        columns = [
            ('string', 'HIT Title'),
            ('number', 'Reward'),
            ('datetime', 'Posted'),
        ]
        ctx = {
            'data': text_row_formater(row_formatter(data)),
            'columns': tuple(columns),
            'title':'Tasks posted during last 30 days by %s' % (requster_name),
            'user': request.user,
        }
        return direct_to_template(request, 'main/requester_details.html',ctx)
Ejemplo n.º 13
0
def update_mviews():

    missing_crawls = query_to_tuples("""select id, start_time 
        from main_crawl p 
    where 
        p.success = true and 
        not exists (select id from main_crawlagregates where crawl_id = p.id ) and 
        old_id is null and 
        groups_downloaded > 200 and
        has_hits_mv = false
    order by id desc""")

    for row in missing_crawls:

        crawl_id, start_time = row

        logging.info("inserting missing crawl: %s" % crawl_id)

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)

        execute_sql("""INSERT INTO
                hits_mv
            SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                TIMESTAMP '%s',
                q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted, null, q.is_spam
            FROM
                main_hitgroupstatus p
            JOIN
                main_hitgroupcontent q ON (q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id)
            WHERE
                p.crawl_id = %s;
        """ % (start_time, crawl_id))

        execute_sql("update main_crawl set has_hits_mv = true where id = %s" %
                    crawl_id)

        execute_sql('commit;')
Ejemplo n.º 14
0
def update_mviews():

    missing_crawls = query_to_tuples("""select id, start_time 
        from main_crawl p 
    where 
        p.success = true and 
        not exists (select id from main_crawlagregates where crawl_id = p.id ) and 
        old_id is null and 
        groups_downloaded > 200 and
        has_hits_mv = false
    order by id desc""")

    for row in missing_crawls:

        crawl_id, start_time = row
        
        logging.info("inserting missing crawl: %s" % crawl_id)        

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)        

        execute_sql("""INSERT INTO
                hits_mv
            SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                TIMESTAMP '%s',
                q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted, null, q.is_spam
            FROM
                main_hitgroupstatus p
            JOIN
                main_hitgroupcontent q ON (q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id)
            WHERE
                p.crawl_id = %s;
        """ % (start_time, crawl_id))

        execute_sql("update main_crawl set has_hits_mv = true where id = %s" % crawl_id)        

        execute_sql('commit;')