def update_crawl_agregates(commit_threshold=10, only_new=True):
    """Insert missing aggregate rows into main_crawlagregates.

    For every crawl that has no aggregate row yet, sums hits and reward over
    distinct hit groups in hits_mv and then records the crawl's spam-project
    count.  Commits every ``commit_threshold`` crawls; a failure rolls back
    and processing continues with the next crawl.

    :param commit_threshold: crawls between intermediate commits
    :param only_new: restrict to crawls with ``old_id IS NULL``
    """
    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)")
    logging.info("got results")
    for i, row in enumerate(results):
        try:
            execute_sql("""
            INSERT INTO main_crawlagregates
            SELECT sum(hits_available) as "hits", start_time,
                sum(reward * hits_available) as "reward", crawl_id,
                nextval('main_crawlagregates_id_seq'), count(*) as "count", 0
            FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY crawl_id, start_time
            """, row['id'])
            # Parameterized like the INSERT above (was %-interpolated), and the
            # leftover debug ``print`` of the raw SQL template is removed.
            execute_sql("""UPDATE main_crawlagregates set spam_projects =
                (select count(*) from hits_mv where crawl_id = %s and is_spam = true)
                where crawl_id = %s""", row['id'], row['id'])
            logging.info("update agregates for %s" % row['id'])
            if i % commit_threshold == 0:
                logging.info('commited after %s crawls' % i)
                execute_sql('commit;')
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # still propagate; the failing crawl is logged and skipped.
            error_info = grab_error(sys.exc_info())
            logging.error('an error occured at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')
    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
def arrivals(request):
    """Render the 'new Tasks/HITs/$$$ per day' timeline.

    Optional ``date_from``/``date_to`` GET parameters (``%m/%d/%Y``) bound
    the range; the default window is the last 30 days.
    """
    params = {
        'multichart': False,
        'columns': ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formater(input):
        # Adapt main_daystats rows to the {date, row} shape the template expects.
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits']), str(cc['reward'])),
            }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']
    # Parameterized query (was string interpolation of request-derived values),
    # consistent with how ``general`` passes its date bounds.
    data = arrivals_data_formater(query_to_dicts('''
        select date as "start_time", arrivals as "hits", arrivals_value as "reward"
        from main_daystats
        where date >= %s and date <= %s
        ''', date_from, date_to))
    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def handle(self, **options): try: commit_threshold = 10 results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)") for i, row in enumerate(results): execute_sql("""insert into hits_mv SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id, ( SELECT main_crawl.start_time FROM main_crawl WHERE main_crawl.id = p.crawl_id) AS start_time, q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted FROM main_hitgroupstatus p JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id WHERE p.crawl_id = %s """ % row['id']) if i % commit_threshold == 0: print datetime.datetime.now(), 'commited after %s crawls' % i execute_sql('commit;') except: error_info = grab_error(sys.exc_info()) print 'an error occured at: %s line, %s %s' % (i,error_info['type'], error_info['value']) execute_sql('rollback;')
def completed(request):
    """Render the 'Tasks/HITs/$$$ completed per day' timeline.

    Optional ``date_from``/``date_to`` GET parameters (``%m/%d/%Y``) bound
    the range; defaults to the last 30 days.  Days with ``day_end_hits = 0``
    are excluded.
    """
    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }
    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']
    # Parameterized (was string-interpolated) date range.
    data = data_formater(query_to_dicts('''
        select date as "start_time",
            day_start_hits - day_end_hits as "hits",
            day_start_reward - day_end_reward as "reward",
            day_start_projects - day_end_projects as "count"
        from main_daystats
        where day_end_hits != 0 and date >= %s and date <= %s
        ''', date_from, date_to))
    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def completed(request):
    """Timeline of completed tasks/HITs/reward per day.

    Reads optional ``date_from``/``date_to`` GET parameters (``%m/%d/%Y``);
    without them the last 30 days are shown.  Rows where ``day_end_hits`` is
    zero are filtered out.
    """
    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }
    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']
    # Dates are now passed as query parameters instead of being interpolated
    # into the SQL text.
    data = data_formater(
        query_to_dicts('''
            select date as "start_time",
                day_start_hits - day_end_hits as "hits",
                day_start_reward - day_end_reward as "reward",
                day_start_projects - day_end_projects as "count"
            from main_daystats
            where day_end_hits != 0 and date >= %s and date <= %s
            ''', date_from, date_to))
    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def general(request):
    """Show the combined reward/hits/projects/spam-projects timeline.

    The range comes from ``date_from``/``date_to`` GET parameters
    (``%m/%d/%Y``); a missing bound defaults to the last seven days.  When
    settings.DATASMOOTHING is on, hit counts that differ from their
    neighbourhood average by more than 7000 are replaced by that average.
    """
    params = {
        'multichart': True,
        'columns': GENERAL_COLUMNS,
        'title': 'General Data'
    }

    if 'date_from' in request.GET:
        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
    else:
        date_from = datetime.datetime.now() - datetime.timedelta(days=7)
    if 'date_to' in request.GET:
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
    else:
        date_to = datetime.datetime.now()

    params['date_from'] = date_from.strftime('%m/%d/%Y')
    params['date_to'] = date_to.strftime('%m/%d/%Y')

    rows = data_formater(query_to_dicts('''
        select reward, hits, projects as "count", spam_projects, start_time
        from main_crawlagregates
        where start_time >= %s and start_time <= %s
        order by start_time asc
        ''', date_from, date_to))

    def _mean_hits(group):
        # Integer average of the hits column over the neighbouring points.
        return sum(int(e['row'][0]) for e in group) / len(group)

    def _is_anomaly(point, others):
        return abs(_mean_hits(others) - int(point['row'][0])) > 7000

    def _fixer(point, others):
        fixed = _mean_hits(others)
        point['row'] = (str(fixed), point['row'][1], point['row'][2], point['row'][3])
        return point

    if settings.DATASMOOTHING:
        params['data'] = plot.repair(list(rows), _is_anomaly, _fixer, 2)
    else:
        params['data'] = list(rows)
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def general(request):
    """Timeline view combining reward, hits, project and spam-project counts.

    ``date_from``/``date_to`` GET parameters (``%m/%d/%Y``) bound the range;
    each missing bound falls back to a seven-day window ending now.  With
    settings.DATASMOOTHING enabled, outlier hit counts (deviation > 7000
    from the neighbourhood average) are smoothed away.
    """
    def _parse(raw):
        # GET date string -> datetime.
        return datetime.datetime(*time.strptime(raw, '%m/%d/%Y')[:6])

    now = datetime.datetime.now()
    date_from = _parse(request.GET['date_from']) if 'date_from' in request.GET else now - datetime.timedelta(days=7)
    date_to = _parse(request.GET['date_to']) if 'date_to' in request.GET else now

    params = {
        'multichart': True,
        'columns': GENERAL_COLUMNS,
        'title': 'General Data',
        'date_from': date_from.strftime('%m/%d/%Y'),
        'date_to': date_to.strftime('%m/%d/%Y'),
    }

    data = data_formater(query_to_dicts('''
        select reward, hits, projects as "count", spam_projects, start_time
        from main_crawlagregates
        where start_time >= %s and start_time <= %s
        order by start_time asc
        ''', date_from, date_to))

    def _avg(entries):
        return sum([int(e['row'][0]) for e in entries]) / len(entries)

    def _is_anomaly(a, others):
        return abs(_avg(others) - int(a['row'][0])) > 7000

    def _fixer(a, others):
        a['row'] = (str(_avg(others)), a['row'][1], a['row'][2], a['row'][3])
        return a

    params['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2) if settings.DATASMOOTHING else list(data)
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def calculate_first_crawl_id():
    """Backfill main_hitgroupcontent.first_crawl_id where it is NULL.

    Sets first_crawl_id to the smallest crawl_id recorded for the group in
    main_hitgroupstatus.  Commits every ``progress`` rows and once at the end.
    """
    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for i, r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        # Parameterized (was %-interpolated) content id.
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where id = %s""", r['id'])
        if i % progress == 0:
            execute_sql('commit;')
            logging.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)
    execute_sql('commit;')
def clean_duplicates(): ids = query_to_dicts("select group_id from main_hitgroupcontent group by group_id having count(*) > 1;") for id in ids: print "deleting %s" % id['group_id'] logging.info( "deleting %s" % id['group_id'] ) execute_sql("""delete from main_hitgroupstatus where hit_group_content_id in ( select id from main_hitgroupcontent where id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'); """ % (id['group_id'], id['group_id'])) execute_sql("""delete from main_hitgroupcontent where id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s' """ % (id['group_id'], id['group_id'])) execute_sql('commit;')
def calculate_first_crawl_id():
    """Populate missing first_crawl_id values on main_hitgroupcontent.

    A group's first crawl is the minimum crawl_id found for it in
    main_hitgroupstatus.  Work is committed every ``commit_every`` rows and
    once more after the loop.
    """
    commit_every = 10
    missing = query_to_dicts(
        "select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for idx, record in enumerate(missing):
        logging.info("\tprocessing %s" % record['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id = (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id) where id = %s """ % record['id'])
        if idx % commit_every == 0:
            execute_sql('commit;')
            logging.info(
                "updated %s main_hitgroupcontent rows with first_crawl_id" % idx)
    execute_sql('commit;')
def clean_duplicates(): ids = query_to_dicts( "select group_id from main_hitgroupcontent group by group_id having count(*) > 1;" ) for id in ids: print "deleting %s" % id['group_id'] logging.info("deleting %s" % id['group_id']) execute_sql("""delete from main_hitgroupstatus where hit_group_content_id in ( select id from main_hitgroupcontent where id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'); """ % (id['group_id'], id['group_id'])) execute_sql("""delete from main_hitgroupcontent where id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s' """ % (id['group_id'], id['group_id'])) execute_sql('commit;')
def arrivals(request):
    """Timeline of newly arrived tasks/HITs/reward per day.

    ``date_from``/``date_to`` GET parameters (``%m/%d/%Y``) override the
    default 30-day window.
    """
    params = {
        'multichart': False,
        'columns': ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formater(input):
        # Shape main_daystats rows into the template's {date, row} records.
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits']), str(cc['reward'])),
            }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']
    # Dates passed as query parameters (previously interpolated into the SQL
    # string), matching the style used by ``general``.
    data = arrivals_data_formater(
        query_to_dicts('''
            select date as "start_time", arrivals as "hits",
                arrivals_value as "reward"
            from main_daystats
            where date >= %s and date <= %s
            ''', date_from, date_to))
    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def handle(self, **options): try: commit_threshold = 1000 results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null") for i, row in enumerate(results): execute_sql("""update main_hitgroupcontent p set first_crawl_id = (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1) where id = %s """ % row['id']) if i % commit_threshold == 0: print datetime.datetime.now(), 'commited after %s rows' % i execute_sql('commit;') except: error_info = grab_error(sys.exc_info()) print 'an error occured at: %s line, %s %s' % (i,error_info['type'], error_info['value']) execute_sql('rollback;')
def handle(self, **options): """ take all crawls that have diffs computed from - inf till yesterday start from the oldest crawl and try to create daily stats if there are some crawls without diffs stop the process """ ''' take earliest crawl calculate incosistencies in history calculate data for every missing element ''' crawl = get_first_crawl() if not crawl: logger.error("no crawls in db") return transaction.enter_transaction_management() transaction.managed(True) for i in range(0, (today() - crawl.start_day()).days): day = crawl.start_day() + datetime.timedelta(days=i) day_end = day + datetime.timedelta(days=1) crawls = Crawl.objects.filter(has_diffs=False, start_time__gte=day, start_time__lt=day_end) if len(crawls) > 0: logger.error("not all crawls from %s have diffs" % day) continue try: DayStats.objects.get(date=day) except DayStats.DoesNotExist: #@UndefinedVariable logger.info( "db_calculate_daily_stats: calculating stats for: %s" % day) range_start_date = day.isoformat() range_end_date = (day_end).isoformat() logger.info("calculating arrivals") ''' stats for projects posted on particular day ''' arrivals = query_to_dicts(''' select sum(hits_diff) as "arrivals", sum(hits_diff*reward) as "arrivals_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff > 0 ''' % (range_start_date, range_end_date)).next() logger.info("calculating processed") processed = query_to_dicts(''' select sum(hits_diff) as "processed", sum(hits_diff*reward) as "processed_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff < 0 ''' % (range_start_date, range_end_date)).next() DayStats.objects.create( date=day, arrivals=arrivals['arrivals'], arrivals_value=arrivals['arrivals_value'], processed=processed['processed'], processed_value=processed['processed_value']) transaction.commit()
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list( Crawl.objects.filter(is_spam_computed=False).order_by( '-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts( """select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id, row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get( id=row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict( body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql( "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed = True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id)) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def handle(self, **options): """ take all crawls that have diffs computed from - inf till yesterday start from the oldest crawl and try to create daily stats if there are some crawls without diffs stop the process """ ''' take earliest crawl calculate incosistencies in history calculate data for every missing element ''' crawl = get_first_crawl() if not crawl: logger.error("no crawls in db") return transaction.enter_transaction_management() transaction.managed(True) for i in range(0,(today() - crawl.start_day()).days): day = crawl.start_day()+datetime.timedelta(days=i) day_end = day + datetime.timedelta(days=1) crawls = Crawl.objects.filter(has_diffs=False, start_time__gte=day, start_time__lt=day_end ) if len(crawls)>0: logger.error("not all crawls from %s have diffs" % day) continue try: DayStats.objects.get(date = day) except DayStats.DoesNotExist: #@UndefinedVariable logger.info("db_calculate_daily_stats: calculating stats for: %s" % day) range_start_date = day.isoformat() range_end_date = (day_end).isoformat() logger.info("calculating arrivals") ''' stats for projects posted on particular day ''' arrivals = query_to_dicts(''' select sum(hits_diff) as "arrivals", sum(hits_diff*reward) as "arrivals_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff > 0 ''' % ( range_start_date, range_end_date)).next() logger.info("calculating processed") processed = query_to_dicts(''' select sum(hits_diff) as "processed", sum(hits_diff*reward) as "processed_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff < 0 ''' % ( range_start_date, range_end_date)).next() DayStats.objects.create(date = day, arrivals = arrivals['arrivals'], arrivals_value = arrivals['arrivals_value'], processed = processed['processed'], processed_value = processed['processed_value'] ) transaction.commit()
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id,row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get(id= row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict(body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed=True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id) ) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def update_crawl_agregates(commit_threshold=10, only_new=True):
    """Create missing per-crawl rows in main_crawlagregates.

    Aggregates hits and reward over distinct hit groups of each crawl that
    lacks an aggregate row, then fills in the spam-project count.  An
    intermediate commit runs every ``commit_threshold`` crawls; an error
    rolls back and the loop moves on.  Aggregate rows covering fewer than
    200 projects are deleted at the end as dummy data.

    :param commit_threshold: crawls between intermediate commits
    :param only_new: when True, only crawls with ``old_id IS NULL``
    """
    if only_new:
        results = query_to_dicts(
            "select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )
    else:
        results = query_to_dicts(
            "select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )
    logging.info("got results")
    for i, row in enumerate(results):
        try:
            execute_sql(
                """
            INSERT INTO main_crawlagregates
            SELECT sum(hits_available) as "hits", start_time,
                sum(reward * hits_available) as "reward", crawl_id,
                nextval('main_crawlagregates_id_seq'), count(*) as "count", 0
            FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY crawl_id, start_time
            """, row['id'])
            # Uses query parameters like the INSERT above; the stray debug
            # ``print`` of the SQL template is gone.
            execute_sql("""UPDATE main_crawlagregates set spam_projects =
                (select count(*) from hits_mv
                 where crawl_id = %s and is_spam = true)
                where crawl_id = %s""", row['id'], row['id'])
            logging.info("update agregates for %s" % row['id'])
            if i % commit_threshold == 0:
                logging.info('commited after %s crawls' % i)
                execute_sql('commit;')
        except Exception:
            # Was a bare ``except:``; keep interrupt signals deliverable.
            error_info = grab_error(sys.exc_info())
            logging.error('an error occured at crawl_id: %s, %s %s' %
                          (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')
    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")