def update_first_occured_agregates(): missing_crawls = query_to_tuples( """select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );""" ) for row in missing_crawls: crawl_id = row[0] logging.info( "inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id) execute_sql("""INSERT INTO main_hitgroupfirstoccurences select p.reward, p.id, q.crawl_id, p.requester_name, q.id, p.occurrence_date, p.requester_id, p.group_id, q.hits_available, nextval('main_hitgroupfirstoccurences_id_seq'::regclass) from main_hitgroupcontent p join main_hitgroupstatus q on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id ) where q.crawl_id = %s;""" % crawl_id) execute_sql('commit;')
def update_first_occured_agregates(): missing_crawls = query_to_tuples("""select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );""") for row in missing_crawls: crawl_id = row[0] logging.info("inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id) execute_sql("""INSERT INTO main_hitgroupfirstoccurences select p.reward, p.id, q.crawl_id, p.requester_name, q.id, p.occurrence_date, p.requester_id, p.group_id, q.hits_available, nextval('main_hitgroupfirstoccurences_id_seq'::regclass) from main_hitgroupcontent p join main_hitgroupstatus q on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id ) where q.crawl_id = %s;""" % crawl_id) execute_sql('commit;')
def topreq_data(days): from tenclouds.sql import query_to_tuples, execute_sql start_time = time.time() firstcrawl = execute_sql(""" SELECT crawl_id FROM hits_mv WHERE start_time > %s ORDER BY start_time ASC LIMIT 1;""", datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0] return list(query_to_tuples(""" SELECT h.requester_id, h.requester_name, count(*) as "projects", sum(mv.hits_available) as "hits", sum(mv.hits_available*h.reward) as "reward", max(h.occurrence_date) as "last_posted" FROM main_hitgroupcontent h LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id LEFT JOIN ( SELECT group_id, crawl_id, hits_available from hits_mv where crawl_id> %s ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id) WHERE h.first_crawl_id > %s AND coalesce(p.is_public, true) = true group by h.requester_id, h.requester_name order by sum(mv.hits_available*h.reward) desc limit 1000;""" % (firstcrawl, firstcrawl)))
def topreq_data(days): from tenclouds.sql import query_to_tuples, execute_sql start_time = time.time() firstcrawl = execute_sql( """ SELECT crawl_id FROM hits_mv WHERE start_time > %s ORDER BY start_time ASC LIMIT 1;""", datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0] return list( query_to_tuples(""" SELECT h.requester_id, h.requester_name, count(*) as "projects", sum(mv.hits_available) as "hits", sum(mv.hits_available*h.reward) as "reward", max(h.occurrence_date) as "last_posted" FROM main_hitgroupcontent h LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id LEFT JOIN ( SELECT group_id, crawl_id, hits_available from hits_mv where crawl_id> %s ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id) WHERE h.first_crawl_id > %s AND coalesce(p.is_public, true) = true group by h.requester_id, h.requester_name order by sum(mv.hits_available*h.reward) desc limit 1000;""" % (firstcrawl, firstcrawl)))
def top_requesters(request): def row_formatter(input): "Yield formatted rows" for cc in input: row = [] url = reverse('requester_details', kwargs={'requester_id': cc[0]}) row.append('<a href="%s">%s</a>' % (url, cc[1])) row.append( '<a href="https://www.mturk.com/mturk/searchbar?requesterId=%s" target="_mturk">%s</a> (<a href="http://feed.crowdsauced.com/r/req/%s">RSS</a>)' % (cc[0], cc[0], cc[0])) row.extend(cc[2:6]) url = reverse('admin-toggle-requester-status', args=(cc[0], )) row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private')) yield row date_30_days_before = datetime.date.today() - datetime.timedelta(days=30) data = row_formatter( query_to_tuples( ''' SELECT h.requester_id, h.requester_name, count(*) as "projects", sum(h.hits_available) as "hits", sum(h.hits_available*reward) as "reward", max(h.occurrence_date) as "last_posted", coalesce(p.is_public, true) as is_public FROM main_hitgroupfirstoccurences h LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id WHERE h.occurrence_date > %s GROUP BY h.requester_id, h.requester_name, p.is_public ORDER BY sum(h.hits_available*reward) DESC LIMIT 1000; ''', date_30_days_before)) columns = ( ('string', 'Requester ID'), ('string', 'Requester'), ('number', '#Task'), ('number', '#HITs'), ('number', 'Rewards'), ('datetime', 'Last Posted On'), ('string', 'Status'), ) ctx = { 'data': data, 'columns': columns, 'title': 'Top-1000 Recent Requesters', } return direct_to_template(request, 'main/graphs/table.html', ctx)
def top_requesters(request): def row_formatter(input): "Yield formatted rows" for cc in input: row = [] url = reverse('requester_details', kwargs={'requester_id':cc[0]}) row.append('<a href="%s">%s</a>' % (url, cc[1])) row.append('<a href="https://www.mturk.com/mturk/searchbar?requesterId=%s" target="_mturk">%s</a> (<a href="http://feed.crowdsauced.com/r/req/%s">RSS</a>)' % (cc[0],cc[0],cc[0]) ) row.extend(cc[2:6]) url = reverse('admin-toggle-requester-status', args=(cc[0], )) row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private')) yield row date_30_days_before = datetime.date.today() - datetime.timedelta(days=30) data = row_formatter(query_to_tuples(''' SELECT h.requester_id, h.requester_name, count(*) as "projects", sum(h.hits_available) as "hits", sum(h.hits_available*reward) as "reward", max(h.occurrence_date) as "last_posted", coalesce(p.is_public, true) as is_public FROM main_hitgroupfirstoccurences h LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id WHERE h.occurrence_date > %s GROUP BY h.requester_id, h.requester_name, p.is_public ORDER BY sum(h.hits_available*reward) DESC LIMIT 1000; ''', date_30_days_before)) columns = ( ('string','Requester ID'), ('string','Requester'), ('number','#Task'), ('number','#HITs'), ('number','Rewards'), ('datetime', 'Last Posted On'), ('string', 'Status'), ) ctx = { 'data': data, 'columns': columns, 'title': 'Top-1000 Recent Requesters', } return direct_to_template(request, 'main/graphs/table.html', ctx)
def requester_details(request, requester_id): def row_formatter(input): for cc in input: row = [] url = reverse('hit_group_details', kwargs={'hit_group_id': cc[5]}) row.append('<a href="%s">%s</a>' % (url, cc[0])) row.extend(cc[1:5]) url = reverse('admin-toggle-hitgroup-status', args=(cc[5], )) row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private')) yield row requester_name = HitGroupContent.objects.filter(requester_id=requester_id) requester_name = requester_name.values_list('requester_name', flat=True).distinct() if requester_name: requester_name = requester_name[0] else: requester_name = requester_id date_from = datetime.date.today() - datetime.timedelta(days=30) data = query_to_tuples( """ SELECT title, hits_available, p.reward, p.occurrence_date, (SELECT end_time FROM main_crawl WHERE id = (SELECT max(crawl_id) FROM main_hitgroupstatus WHERE group_id = q.group_id AND hit_group_content_id = p.group_content_id)) - p.occurrence_date, p.group_id, q.is_public FROM main_hitgroupfirstoccurences p JOIN main_hitgroupcontent q ON (p.group_content_id = q.id AND p.requester_id = q.requester_id) WHERE p.requester_id = %s AND p.occurrence_date > %s AND q.occurrence_date > %s """, requester_id, date_from, date_from) columns = ( ('string', 'HIT Title'), ('number', '#HITs'), ('number', 'Reward'), ('datetime', 'Posted'), ('number', 'Duration (Days)'), ('string', 'Status'), ) ctx = { 'data': text_row_formater(row_formatter(data)), 'columns': tuple(columns), 'title': 'Last 100 Tasks posted by %s' % requester_name, 'user': request.user, } return direct_to_template(request, 'main/requester_details.html', ctx)
def requester_details(request, requester_id): def row_formatter(input): for cc in input: row = [] url = reverse('hit_group_details',kwargs={'hit_group_id':cc[5]}) row.append('<a href="%s">%s</a>' % (url, cc[0])) row.extend(cc[1:5]) url = reverse('admin-toggle-hitgroup-status', args=(cc[5],)) row.append('<a href="%s">%s</a>' % (url, cc[6] and 'public' or 'private')) yield row requester_name = HitGroupContent.objects.filter(requester_id=requester_id) requester_name = requester_name.values_list('requester_name',flat=True).distinct() if requester_name: requester_name = requester_name[0] else: requester_name = requester_id date_from = datetime.date.today() - datetime.timedelta(days=30) data = query_to_tuples(""" SELECT title, hits_available, p.reward, p.occurrence_date, (SELECT end_time FROM main_crawl WHERE id = (SELECT max(crawl_id) FROM main_hitgroupstatus WHERE group_id = q.group_id AND hit_group_content_id = p.group_content_id)) - p.occurrence_date, p.group_id, q.is_public FROM main_hitgroupfirstoccurences p JOIN main_hitgroupcontent q ON (p.group_content_id = q.id AND p.requester_id = q.requester_id) WHERE p.requester_id = %s AND p.occurrence_date > %s AND q.occurrence_date > %s """, requester_id, date_from, date_from) columns = ( ('string', 'HIT Title'), ('number', '#HITs'), ('number', 'Reward'), ('datetime', 'Posted'), ('number', 'Duration (Days)'), ('string', 'Status'), ) ctx = { 'data': text_row_formater(row_formatter(data)), 'columns': tuple(columns), 'title': 'Last 100 Tasks posted by %s' % requester_name, 'user': request.user, } return direct_to_template(request, 'main/requester_details.html', ctx)
def _requester_details(request, requester_id): def row_formatter(input): for cc in input: row = [] row.append('<a href="%s">%s</a>' % (reverse('hit_group_details', kwargs={'hit_group_id': cc[3]}), cc[0])) row.extend(cc[1:3]) yield row requster_name = HitGroupContent.objects.filter( requester_id=requester_id).values_list('requester_name', flat=True).distinct() if requster_name: requster_name = requster_name[0] else: requster_name = requester_id date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat() data = query_to_tuples(""" select title, p.reward, p.occurrence_date, p.group_id from main_hitgroupcontent p LEFT JOIN main_requesterprofile r ON p.requester_id = r.requester_id where p.requester_id = '%s' AND coalesce(r.is_public, true) = true and p.occurrence_date > TIMESTAMP '%s'; """ % (requester_id, date_from)) columns = [ ('string', 'HIT Title'), ('number', 'Reward'), ('datetime', 'Posted'), ] ctx = { 'data': text_row_formater(row_formatter(data)), 'columns': tuple(columns), 'title': 'Tasks posted during last 30 days by %s' % (requster_name), 'user': request.user, } return direct_to_template(request, 'main/requester_details.html', ctx)
def update_cid(cid): st = time.time() count = 0 for i, g in enumerate(query_to_tuples("select distinct group_id from hits_mv where crawl_id = %s", cid)): g = g[0] log.info("processing %s, %s %s", i, cid, g) updatehitgroup(g, cid) count += 1 execute_sql("commit;") log.info("updated crawl in %s", time.time() - st) return count
def update_cid(cid): st = time.time() count = 0 for i, g in enumerate( query_to_tuples( "select distinct group_id from hits_mv where crawl_id = %s", cid)): g = g[0] log.info("processing %s, %s %s", i, cid, g) updatehitgroup(g, cid) count += 1 execute_sql("commit;") log.info("updated crawl in %s", time.time() - st) return count
def _requester_details(request, requester_id): def row_formatter(input): for cc in input: row = [] row.append('<a href="%s">%s</a>' % (reverse('hit_group_details',kwargs={'hit_group_id':cc[3]}),cc[0])) row.extend(cc[1:3]) yield row requster_name = HitGroupContent.objects.filter(requester_id = requester_id).values_list('requester_name',flat=True).distinct() if requster_name: requster_name = requster_name[0] else: requster_name = requester_id date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat() data = query_to_tuples(""" select title, p.reward, p.occurrence_date, p.group_id from main_hitgroupcontent p LEFT JOIN main_requesterprofile r ON p.requester_id = r.requester_id where p.requester_id = '%s' AND coalesce(r.is_public, true) = true and p.occurrence_date > TIMESTAMP '%s'; """ % (requester_id, date_from)) columns = [ ('string', 'HIT Title'), ('number', 'Reward'), ('datetime', 'Posted'), ] ctx = { 'data': text_row_formater(row_formatter(data)), 'columns': tuple(columns), 'title':'Tasks posted during last 30 days by %s' % (requster_name), 'user': request.user, } return direct_to_template(request, 'main/requester_details.html',ctx)
def update_mviews(): missing_crawls = query_to_tuples("""select id, start_time from main_crawl p where p.success = true and not exists (select id from main_crawlagregates where crawl_id = p.id ) and old_id is null and groups_downloaded > 200 and has_hits_mv = false order by id desc""") for row in missing_crawls: crawl_id, start_time = row logging.info("inserting missing crawl: %s" % crawl_id) execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id) execute_sql("""INSERT INTO hits_mv SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id, TIMESTAMP '%s', q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted, null, q.is_spam FROM main_hitgroupstatus p JOIN main_hitgroupcontent q ON (q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id) WHERE p.crawl_id = %s; """ % (start_time, crawl_id)) execute_sql("update main_crawl set has_hits_mv = true where id = %s" % crawl_id) execute_sql('commit;')