Example #1
def update_crawl_agregates(commit_threshold=10, only_new=True):

    results = None

    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)")

    logging.info("got results")

    for i, row in enumerate(results):
        try:

            execute_sql("""
            INSERT INTO
                main_crawlagregates
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
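                -- DISTINCT ON (group_id) keeps one hits_mv row per HIT group for this crawl before aggregating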
                (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s""" % (row['id'], row['id']) ) 

            print """UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s"""


            logging.info("update agregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')


    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
Example #2
def arrivals(request):

    params = {
        'multichart': False,
        'columns':ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formater(input):
        for cc in input:
            yield {
                    'date': cc['start_time'],
                    'row': (str(cc['hits']), str(cc['reward'])),
            }    

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = arrivals_data_formater(query_to_dicts('''
        select date as "start_time", arrivals as "hits", arrivals_value as "reward"
        from main_daystats where date >= '%s' and date <= '%s'
    ''' % (date_from,date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #3
    def handle(self, **options):

        try:

            commit_threshold = 10
            results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")

            for i, row in enumerate(results):

                # rebuild the hits_mv rows for every crawl that is missing from the materialized view
                execute_sql("""insert into hits_mv
                    SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                           (SELECT main_crawl.start_time FROM main_crawl WHERE main_crawl.id = p.crawl_id) AS start_time,
                           q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted
                    FROM main_hitgroupstatus p
                    JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id
                    WHERE p.crawl_id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'committed after %s crawls' % i
                    execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occurred at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Example #4
def completed(request):

    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = data_formater(query_to_dicts('''
        select date as "start_time", day_start_hits - day_end_hits as "hits", day_start_reward - day_end_reward as "reward", day_start_projects - day_end_projects as "count"
            from main_daystats where day_end_hits != 0 and date >= '%s' and date <= '%s'
    ''' % (date_from,date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #5
def completed(request):

    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }

    date_from = (datetime.date.today() -
                 datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = data_formater(
        query_to_dicts('''
        select date as "start_time", day_start_hits - day_end_hits as "hits", day_start_reward - day_end_reward as "reward", day_start_projects - day_end_projects as "count"
            from main_daystats where day_end_hits != 0 and date >= '%s' and date <= '%s'
    ''' % (date_from, date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #6
def general(request):

    params = {
        'multichart': True,
        'columns': GENERAL_COLUMNS,
        'title': 'General Data'
    }

    if 'date_from' in request.GET:
        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
    else:
        date_from = datetime.datetime.now() - datetime.timedelta(days=7)

    if 'date_to' in request.GET:
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
    else:
        date_to = datetime.datetime.now()

    params['date_from'] = date_from.strftime('%m/%d/%Y')
    params['date_to'] = date_to.strftime('%m/%d/%Y')

    data = data_formater(
        query_to_dicts(
            '''
        select reward, hits, projects as "count", spam_projects, start_time
            from main_crawlagregates
            where start_time >= %s and start_time <= %s
            order by start_time asc
        ''', date_from, date_to))

    def _is_anomaly(a, others):
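        # flag a point whose hits value differs from the mean of its neighbours by more than 7000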
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
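        # replace the anomalous hits value with the mean of the neighbouring points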
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        params['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        params['data'] = list(data)

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #7
def general(request):

    params = {
        'multichart': True,
        'columns':GENERAL_COLUMNS,
        'title': 'General Data'
    }

    if 'date_from' in request.GET:
        date_from = datetime.datetime(
                *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
    else:
        date_from = datetime.datetime.now() - datetime.timedelta(days=7)

    if 'date_to' in request.GET:
        date_to = datetime.datetime(
                *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
    else:
        date_to = datetime.datetime.now()

    params['date_from'] = date_from.strftime('%m/%d/%Y')
    params['date_to'] = date_to.strftime('%m/%d/%Y')

    data = data_formater(query_to_dicts('''
        select reward, hits, projects as "count", spam_projects, start_time
            from main_crawlagregates
            where start_time >= %s and start_time <= %s
            order by start_time asc
        ''', date_from, date_to))

    def _is_anomaly(a, others):
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        params['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        params['data'] = list(data)

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #8
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for i,r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            logging.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)



    execute_sql('commit;')
Example #9
def clean_duplicates():

    ids = query_to_dicts("select group_id from main_hitgroupcontent group by group_id having count(*) > 1;")

    for id in ids:
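        # keep the content row with the smallest id for this group_id: delete the
        # dependent main_hitgroupstatus rows first, then the redundant main_hitgroupcontent rows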
        print "deleting %s" % id['group_id']
        logging.info( "deleting %s" % id['group_id'] )

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Example #10
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts(
        "select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for i, r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            logging.info(
                "updated %s main_hitgroupcontent rows with first_crawl_id" % i)

    execute_sql('commit;')
Example #11
def clean_duplicates():

    ids = query_to_dicts(
        "select group_id from main_hitgroupcontent group by group_id having count(*) > 1;"
    )

    for id in ids:
        print "deleting %s" % id['group_id']
        logging.info("deleting %s" % id['group_id'])

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Example #12
def arrivals(request):

    params = {
        'multichart': False,
        'columns': ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formater(input):
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits']), str(cc['reward'])),
            }

    date_from = (datetime.date.today() -
                 datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = arrivals_data_formater(
        query_to_dicts('''
        select date as "start_time", arrivals as "hits", arrivals_value as "reward"
        from main_daystats where date >= '%s' and date <= '%s'
    ''' % (date_from, date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #13
    def handle(self, **options):

        try:

            commit_threshold = 1000
            results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")

            for i, row in enumerate(results):

                # first_crawl_id is the lowest crawl_id at which this HIT group was ever seen
                execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                    (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1)
                    where
                        id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'committed after %s rows' % i
                    execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occurred at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Example #14
    def handle(self, **options):
        """
        Take all crawls that have diffs computed, from -inf till yesterday.
        Start from the oldest crawl and try to create daily stats.
        If there are some crawls without diffs, stop the process.
        """
        '''
        take the earliest crawl
        calculate inconsistencies in the history
        calculate data for every missing element
        '''
        crawl = get_first_crawl()
        if not crawl:
            logger.error("no crawls in db")
            return

        transaction.enter_transaction_management()
        transaction.managed(True)

        for i in range(0, (today() - crawl.start_day()).days):
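            # walk forward one day at a time from the first crawl's day up to yesterday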

            day = crawl.start_day() + datetime.timedelta(days=i)
            day_end = day + datetime.timedelta(days=1)

            crawls = Crawl.objects.filter(has_diffs=False,
                                          start_time__gte=day,
                                          start_time__lt=day_end)
            if len(crawls) > 0:
                logger.error("not all crawls from %s have diffs" % day)
                continue

            try:
                DayStats.objects.get(date=day)
            except DayStats.DoesNotExist:  #@UndefinedVariable
                logger.info(
                    "db_calculate_daily_stats: calculating stats for: %s" %
                    day)

                range_start_date = day.isoformat()
                range_end_date = (day_end).isoformat()

                logger.info("calculating arrivals")
                '''
                stats for projects posted on particular day
                '''
                arrivals = query_to_dicts('''
                    select sum(hits_diff) as "arrivals", sum(hits_diff*reward) as "arrivals_value"
                    from 
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff > 0
                    ''' % (range_start_date, range_end_date)).next()

                logger.info("calculating processed")

                processed = query_to_dicts('''
                    select sum(hits_diff) as "processed", sum(hits_diff*reward) as "processed_value"
                    from 
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff < 0
                    ''' % (range_start_date, range_end_date)).next()

                DayStats.objects.create(
                    date=day,
                    arrivals=arrivals['arrivals'],
                    arrivals_value=arrivals['arrivals_value'],
                    processed=processed['processed'],
                    processed_value=processed['processed_value'])

                transaction.commit()
Example #15
    def handle(self, **options):
        """
        Take ${limit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(
                    Crawl.objects.filter(is_spam_computed=False).order_by(
                        '-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])

                updated = 0

                for row in query_to_dicts(
                        """select content_id, group_id, is_spam from hits_mv 
                    where 
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id, row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(
                            id=row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(
                                body=body, data=options['file']).execute()

                            number_of_predictions += 1
                            updated += 1

                            content.is_spam = prediction['outputLabel'] != 'No'  # any label other than 'No' counts as spam
                            content.save()

                        execute_sql(
                            "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'"
                            % ('true' if content.is_spam else 'false', c.id,
                               row['group_id']))
                        transaction.commit()

                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)

                if updated > 0:
                    c.is_spam_computed = True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates 
                    set spam_projects = 
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id))

                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()
Example #16
    def handle(self, **options):

        """
        Take all crawls that have diffs computed, from -inf till yesterday.
        Start from the oldest crawl and try to create daily stats.
        If there are some crawls without diffs, stop the process.
        """

        '''
        take the earliest crawl
        calculate inconsistencies in the history
        calculate data for every missing element
        '''
        crawl = get_first_crawl()
        if not crawl:
            logger.error("no crawls in db")
            return
        
        transaction.enter_transaction_management()
        transaction.managed(True)
        
        
        for i in range(0,(today() - crawl.start_day()).days):
            
            day = crawl.start_day()+datetime.timedelta(days=i)
            day_end = day + datetime.timedelta(days=1)
            
            crawls = Crawl.objects.filter(has_diffs=False, start_time__gte=day, start_time__lt=day_end )
            if len(crawls)>0:
                logger.error("not all crawls from %s have diffs" % day)
                continue

            try:
                DayStats.objects.get(date = day)
            except DayStats.DoesNotExist: #@UndefinedVariable
                logger.info("db_calculate_daily_stats: calculating stats for: %s" % day)
                
                range_start_date   = day.isoformat()
                range_end_date     = (day_end).isoformat()
                
                logger.info("calculating arrivals")

                '''
                stats for projects posted on particular day
                '''
                arrivals = query_to_dicts('''
                    select sum(hits_diff) as "arrivals", sum(hits_diff*reward) as "arrivals_value"
                    from 
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff > 0
                    ''' % ( range_start_date, range_end_date)).next()

                logger.info("calculating processed")

                processed = query_to_dicts('''
                    select sum(hits_diff) as "processed", sum(hits_diff*reward) as "processed_value"
                    from 
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff < 0
                    ''' % ( range_start_date, range_end_date)).next()                    
                                
                DayStats.objects.create(date = day,
                                        
                                        arrivals = arrivals['arrivals'],
                                        arrivals_value = arrivals['arrivals_value'],
                                        processed = processed['processed'],
                                        processed_value = processed['processed_value']
                                        )
                
                transaction.commit()
Example #17
    def handle(self, **options):

        """
        Take ${limit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])
                
                updated = 0

                for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv 
                    where 
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id,row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(id= row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(body=body, data=options['file']).execute()
                            
                            number_of_predictions += 1
                            updated += 1                    
                            
                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id']))       
                        transaction.commit()
                            
                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)
                
                if updated > 0:
                    c.is_spam_computed=True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates 
                    set spam_projects = 
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id) ) 


                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()            
Example #18
def update_crawl_agregates(commit_threshold=10, only_new=True):

    results = None

    if only_new:
        results = query_to_dicts(
            "select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )
    else:
        results = query_to_dicts(
            "select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )

    logging.info("got results")

    for i, row in enumerate(results):
        try:

            execute_sql(
                """
            INSERT INTO
                main_crawlagregates
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s""" % (row['id'], row['id']))

            print """UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s"""

            logging.info("update agregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' %
                          (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")