Example #1
def update_first_occured_agregates():

    missing_crawls = query_to_tuples(
        """select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );"""
    )

    for row in missing_crawls:

        crawl_id = row[0]
        logging.info(
            "inserting missing crawl into main_hitgroupfirstoccurences: %s" %
            crawl_id)

        execute_sql("""INSERT INTO
                main_hitgroupfirstoccurences
                    select
                        p.reward,
                        p.id,
                        q.crawl_id,
                        p.requester_name,
                        q.id,
                        p.occurrence_date,
                        p.requester_id,
                        p.group_id,
                        q.hits_available,
                        nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
                    from main_hitgroupcontent p join main_hitgroupstatus q
                        on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id )
                        where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
Example #2
def update_diffs(limit=100):
    start_time = time.time()
    execute_sql(
        """
        UPDATE hits_mv
            SET hits_diff = diffs.hits_diff
        FROM
            (SELECT
                group_id, crawl_id,
                    hits_available -
                    COALESCE(lag(hits_available) over (partition BY group_id ORDER BY crawl_id), 0)
                    AS hits_diff
            FROM hits_mv
            WHERE
                hits_diff is NULL
            AND crawl_id in
                (SELECT DISTINCT crawl_id
                FROM hits_mv
                WHERE hits_diff is NULL
                ORDER BY crawl_id DESC LIMIT %s)

            ) AS diffs

        WHERE (diffs.group_id = hits_mv.group_id) AND (diffs.crawl_id = hits_mv.crawl_id);""",
        (int(limit), ))
    execute_sql('commit;')

    log.debug('Updated diffs for %s crawls in %s', limit,
              time.time() - start_time)
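
Note that update_diffs binds the limit as a query parameter, passing (int(limit),) separately from the SQL text, while most of the later examples interpolate values straight into the string with the % operator. A minimal sketch of the two call styles (the query shown is only illustrative), assuming execute_sql(sql, params=None) forwards to a DB-API cursor:

# Parameterized style, as in update_diffs: the database driver binds the value.
execute_sql("SELECT count(*) FROM hits_mv WHERE crawl_id = %s", (crawl_id,))

# Interpolated style, as in many examples below: the value is formatted into
# the SQL text before execute_sql ever sees it.
execute_sql("SELECT count(*) FROM hits_mv WHERE crawl_id = %s" % crawl_id)
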
Example #3
def update_first_occured_agregates():

    missing_crawls = query_to_tuples("""select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );""")

    for row in missing_crawls:

        crawl_id = row[0]
        logging.info("inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id)

        execute_sql("""INSERT INTO
                main_hitgroupfirstoccurences
                    select
                        p.reward,
                        p.id,
                        q.crawl_id,
                        p.requester_name,
                        q.id,
                        p.occurrence_date,
                        p.requester_id,
                        p.group_id,
                        q.hits_available,
                        nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
                    from main_hitgroupcontent p join main_hitgroupstatus q
                        on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id )
                        where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
Example #4
def update_diffs(limit=100):
    start_time = time.time()
    execute_sql("""
        UPDATE hits_mv
            SET hits_diff = diffs.hits_diff
        FROM
            (SELECT
                group_id, crawl_id,
                    hits_available -
                    COALESCE(lag(hits_available) over (partition BY group_id ORDER BY crawl_id), 0)
                    AS hits_diff
            FROM hits_mv
            WHERE
                hits_diff is NULL
            AND crawl_id in
                (SELECT DISTINCT crawl_id
                FROM hits_mv
                WHERE hits_diff is NULL
                ORDER BY crawl_id DESC LIMIT %s)

            ) AS diffs

        WHERE (diffs.group_id = hits_mv.group_id) AND (diffs.crawl_id = hits_mv.crawl_id);""",
        (int(limit), ))
    execute_sql('commit;')

    log.debug('Updated diffs for %s crawls in %s', limit, time.time() - start_time)
Example #5
def updatehitgroup(g, cid):
    prev = execute_sql("""select hits_available from hits_mv 
                where 
                    crawl_id between %s and %s and 
                    group_id = '%s' 
                order by crawl_id desc 
                limit 1;""" % (cid-100, cid-1, g)).fetchall()
    prev = prev[0][0] if prev else 0
    
    execute_sql("""update hits_mv set hits_diff = hits_available - %s where
            group_id = '%s' and crawl_id = %s;""" % (prev, g, cid))
Example #6
def updatehitgroup(g, cid):
    prev = execute_sql("""select hits_available from hits_mv 
                where 
                    crawl_id between %s and %s and 
                    group_id = '%s' 
                order by crawl_id desc 
                limit 1;""" % (cid - 100, cid - 1, g)).fetchall()
    prev = prev[0][0] if prev else 0

    execute_sql("""update hits_mv set hits_diff = hits_available - %s where
            group_id = '%s' and crawl_id = %s;""" % (prev, g, cid))
Example #7
def update_cid(cid):

    st = time.time()
    count = 0
    for i, g in enumerate(query_to_tuples("select distinct group_id from hits_mv where crawl_id = %s", cid)):
        g = g[0]
        log.info("processing %s, %s %s", i, cid,  g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")

    log.info("updated crawl in %s", time.time() - st)

    return count
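
A hypothetical usage sketch (not from the source): update_cid can be paired with the last_crawlid helper shown in Example #24 to refresh hits_diff for every group of the most recent crawl.

# Hypothetical driver snippet combining two of the helpers in these examples.
cid = last_crawlid()      # newest crawl_id present in hits_mv (Example #24)
groups = update_cid(cid)  # returns the number of hit groups processed
log.info("recomputed hits_diff for %s groups in crawl %s", groups, cid)
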
Example #8
def topreq_data(days):
    from tenclouds.sql import query_to_tuples, execute_sql
    start_time = time.time()
    firstcrawl = execute_sql(
        """
        SELECT crawl_id
        FROM hits_mv
        WHERE
            start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""",
        datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    return list(
        query_to_tuples("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(*) as "projects",
            sum(mv.hits_available) as "hits",
            sum(mv.hits_available*h.reward) as "reward",
            max(h.occurrence_date) as "last_posted"
        FROM
                main_hitgroupcontent h
                LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
                LEFT JOIN (
                    SELECT group_id, crawl_id, hits_available from
                    hits_mv where crawl_id> %s
                ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id)
            WHERE
                h.first_crawl_id > %s
                AND coalesce(p.is_public, true) = true
            group by h.requester_id, h.requester_name
            order by sum(mv.hits_available*h.reward) desc
            limit 1000;""" % (firstcrawl, firstcrawl)))
Example #9
def topreq_data(days):
    from tenclouds.sql import query_to_tuples, execute_sql
    start_time = time.time()
    firstcrawl = execute_sql("""
        SELECT crawl_id
        FROM hits_mv
        WHERE
            start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""", datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    return list(query_to_tuples("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(*) as "projects",
            sum(mv.hits_available) as "hits",
            sum(mv.hits_available*h.reward) as "reward",
            max(h.occurrence_date) as "last_posted"
        FROM
                main_hitgroupcontent h
                LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
                LEFT JOIN (
                    SELECT group_id, crawl_id, hits_available from
                    hits_mv where crawl_id> %s
                ) mv ON (h.group_id=mv.group_id and h.first_crawl_id=mv.crawl_id)
            WHERE
                h.first_crawl_id > %s
                AND coalesce(p.is_public, true) = true
            group by h.requester_id, h.requester_name
            order by sum(mv.hits_available*h.reward) desc
            limit 1000;""" % (firstcrawl, firstcrawl)))
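
These examples import execute_sql and query_to_tuples from tenclouds.sql (query_to_dicts appears in other examples as well); the module itself is not shown anywhere here. Purely as an assumption about its behaviour, a minimal sketch of such wrappers on top of Django's default database connection:

from django.db import connection


def execute_sql(sql, params=None):
    # Run a statement and return the cursor so callers can fetchall().
    # A bare scalar parameter is wrapped into a one-element tuple.
    cursor = connection.cursor()
    if params is None:
        cursor.execute(sql)
    elif isinstance(params, (tuple, list, dict)):
        cursor.execute(sql, params)
    else:
        cursor.execute(sql, (params,))
    return cursor


def query_to_tuples(sql, *params):
    # Yield each result row as a plain tuple.
    cursor = execute_sql(sql, params or None)
    for row in cursor.fetchall():
        yield row


def query_to_dicts(sql, *params):
    # Yield each result row as a {column_name: value} dict.
    cursor = execute_sql(sql, params or None)
    columns = [col[0] for col in cursor.description]
    for row in cursor.fetchall():
        yield dict(zip(columns, row))
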
Example #10
def update_cid(cid):

    st = time.time()
    count = 0
    for i, g in enumerate(
            query_to_tuples(
                "select distinct group_id from hits_mv where crawl_id = %s",
                cid)):
        g = g[0]
        log.info("processing %s, %s %s", i, cid, g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")

    log.info("updated crawl in %s", time.time() - st)

    return count
Example #11
 def handle(self, **options):
     
     update_crawl_agregates(only_new = False)
     
     f = open(os.path.join(settings.ROOT_PATH,'crawl.errors.csv'),"rb")
     progress = 10
     
     execute_sql("update main_crawl set success = true where old_id is not null")
     
     for i,id in enumerate(f):
         id = id.strip()
         execute_sql("delete from main_crawlagregates where crawl_id = (select id from main_crawl where old_id = %s)" % id)
         execute_sql("update main_crawl set success = false where old_id = %s" % id)
         
         if i % progress == 0:
             print "processed %s rows" % i
             execute_sql("commit;")
         
     execute_sql("commit;")
         
     
Example #12
 def handle(self, **options):
     
     try:
     
         commit_threshold = 10
         results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")
         
         for i, row in enumerate(results):
             
             execute_sql("""insert into hits_mv 
 SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id, ( SELECT main_crawl.start_time
            FROM main_crawl
           WHERE main_crawl.id = p.crawl_id) AS start_time, q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted
    FROM main_hitgroupstatus p
    JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id
   WHERE p.crawl_id = %s            
             """ % row['id'])                    
             
             if i % commit_threshold == 0:
                 print datetime.datetime.now(), 'committed after %s crawls' % i
                 execute_sql('commit;')  
         
         
     except:
         error_info = grab_error(sys.exc_info())
         print 'an error occurred at line %s, %s %s' % (i, error_info['type'], error_info['value'])
         execute_sql('rollback;')  
Example #13
    def handle(self, **options):

        update_crawl_agregates(only_new=False)

        f = open(os.path.join(settings.ROOT_PATH, 'crawl.errors.csv'), "rb")
        progress = 10

        execute_sql(
            "update main_crawl set success = true where old_id is not null")

        for i, id in enumerate(f):
            id = id.strip()
            execute_sql(
                "delete from main_crawlagregates where crawl_id = (select id from main_crawl where old_id = %s)"
                % id)
            execute_sql(
                "update main_crawl set success = false where old_id = %s" % id)

            if i % progress == 0:
                print "processed %s rows" % i
                execute_sql("commit;")

        execute_sql("commit;")
Example #14
def update_mviews():

    missing_crawls = query_to_tuples("""select id, start_time 
        from main_crawl p 
    where 
        p.success = true and 
        not exists (select id from main_crawlagregates where crawl_id = p.id ) and 
        old_id is null and 
        groups_downloaded > 200 and
        has_hits_mv = false
    order by id desc""")

    for row in missing_crawls:

        crawl_id, start_time = row

        logging.info("inserting missing crawl: %s" % crawl_id)

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)

        execute_sql("""INSERT INTO
                hits_mv
            SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                TIMESTAMP '%s',
                q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted, null, q.is_spam
            FROM
                main_hitgroupstatus p
            JOIN
                main_hitgroupcontent q ON (q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id)
            WHERE
                p.crawl_id = %s;
        """ % (start_time, crawl_id))

        execute_sql("update main_crawl set has_hits_mv = true where id = %s" %
                    crawl_id)

        execute_sql('commit;')
Example #15
def update_mviews():

    missing_crawls = query_to_tuples("""select id, start_time 
        from main_crawl p 
    where 
        p.success = true and 
        not exists (select id from main_crawlagregates where crawl_id = p.id ) and 
        old_id is null and 
        groups_downloaded > 200 and
        has_hits_mv = false
    order by id desc""")

    for row in missing_crawls:

        crawl_id, start_time = row
        
        logging.info("inserting missing crawl: %s" % crawl_id)        

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)        

        execute_sql("""INSERT INTO
                hits_mv
            SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                TIMESTAMP '%s',
                q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted, null, q.is_spam
            FROM
                main_hitgroupstatus p
            JOIN
                main_hitgroupcontent q ON (q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id)
            WHERE
                p.crawl_id = %s;
        """ % (start_time, crawl_id))

        execute_sql("update main_crawl set has_hits_mv = true where id = %s" % crawl_id)        

        execute_sql('commit;')
Example #16
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts(
        "select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for i, r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            logging.info(
                "updated %s main_hitgroupcontent rows with first_crawl_id" % i)

    execute_sql('commit;')
Example #17
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
    logging.info('got missing ids results')
    for i,r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            logging.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)



    execute_sql('commit;')
Example #18
def clean_duplicates():

    ids = query_to_dicts("select group_id from main_hitgroupcontent group by group_id having count(*) > 1;")

    for id in ids:
        print "deleting %s" % id['group_id']
        logging.info( "deleting %s" % id['group_id'] )

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Example #19
def clean_duplicates():

    ids = query_to_dicts(
        "select group_id from main_hitgroupcontent group by group_id having count(*) > 1;"
    )

    for id in ids:
        print "deleting %s" % id['group_id']
        logging.info("deleting %s" % id['group_id'])

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Example #20
 def handle(self, **options):
     
     try:
         
         commit_threshold = 1000
         results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
         
         for i, row in enumerate(results):
             
             execute_sql("""update main_hitgroupcontent p set first_crawl_id = 
                 (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1)
                 where 
                     id = %s
             """ % row['id'])                    
             
             if i % commit_threshold == 0:
                 print datetime.datetime.now(), 'committed after %s rows' % i
                 execute_sql('commit;')
             
     except:
         error_info = grab_error(sys.exc_info())
         print 'an error occurred at line %s, %s %s' % (i, error_info['type'], error_info['value'])
         execute_sql('rollback;')                  
         
Example #21
def last_crawlids(limit=10):
    r = execute_sql("select crawl_id from hits_mv order by crawl_id desc limit %s", limit)
    return [c[0] for c in r.fetchall()]
Example #22
    def handle(self, **options):

        """
        Take ${limit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])
                
                updated = 0

                for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv 
                    where 
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id,row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(id= row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(body=body, data=options['file']).execute()
                            
                            number_of_predictions += 1
                            updated += 1                    
                            
                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id']))       
                        transaction.commit()
                            
                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)
                
                if updated > 0:
                    c.is_spam_computed=True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates 
                    set spam_projects = 
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id) ) 


                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()            
Example #23
    def handle(self, **options):
        """
        Take ${limit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(
                    Crawl.objects.filter(is_spam_computed=False).order_by(
                        '-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])

                updated = 0

                for row in query_to_dicts(
                        """select content_id, group_id, is_spam from hits_mv 
                    where 
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id, row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(
                            id=row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(
                                body=body, data=options['file']).execute()

                            number_of_predictions += 1
                            updated += 1

                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql(
                            "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'"
                            % ('true' if content.is_spam else 'false', c.id,
                               row['group_id']))
                        transaction.commit()

                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)

                if updated > 0:
                    c.is_spam_computed = True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates 
                    set spam_projects = 
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id))

                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()
Example #24
def last_crawlid():
    return execute_sql("select crawl_id from hits_mv order by crawl_id desc limit 1;").fetchall()[0][0]
Example #25
def update_crawl_agregates(commit_threshold=10, only_new=True):

    results = None

    if only_new:
        results = query_to_dicts(
            "select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )
    else:
        results = query_to_dicts(
            "select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)"
        )

    logging.info("got results")

    for i, row in enumerate(results):
        try:

            execute_sql(
                """
            INSERT INTO
                main_crawlagregates
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s""" % (row['id'], row['id']))

            print """UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s"""

            logging.info("update agregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' %
                          (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
Example #26
def update_crawl_agregates(commit_threshold=10, only_new = True):

    results = None

    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)")

    logging.info("got results")

    for i, row in enumerate(results):
        try:

            execute_sql("""
            INSERT INTO
                main_crawlagregates
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s""" % (row['id'], row['id']) ) 

            print """UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s"""


            logging.info("update agregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')


    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
Example #27
def hitgroups(cid):
    r = execute_sql("select distinct group_id from hits_mv where crawl_id = %s", cid)
    return [g[0] for g in r.fetchall()]
Example #28
def hitgroups(cid):
    r = execute_sql(
        "select distinct group_id from hits_mv where crawl_id = %s", cid)
    return [g[0] for g in r.fetchall()]
Example #29
    def handle(self, **options):
        '''
        Sample data:
groupid|hit_title|requester_name|requester_id|description|keywords|qualifications|hit_expiration_date|time_allotted|reward|hits_available|time_crawled|crawl_id|pageNo|inpage_position|dollars
CW18RQ8BPZWWMWY3DY4Z|Validate the addresses. Great bonus |Product Search|A28JEQTWH76JDT|Given the list of addresses and it's site URL, please verify whether the given addresses exists in the site or not. [ Note: Bonus and Reward Details: 1). For each validated address you will get $0.02 as bonus. Suppose if the input file contains 10 addresses and you have validated all of them then you will get $0.2(i.e., 10*0.02) as bonus + $0.01 as reward. ]|reward,  online,  shopping,  web,  quality,  testing,  relevance,  search,  engine,  US,  India,  browse,  hit |HIT approval rate (%) is not less than 80 |Aug 26, 2009  (33 weeks) |1 day 12 hours|$0.01|295|2009-01-07 14:00:05|1|2|5|0.01

HitGroupContent:
    group_id            = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
    group_id_hashed     = models.BooleanField(default=False)
    requester_id        = models.CharField('Requester ID', max_length=50, db_index=True)
    requester_name      = models.CharField('Requester Name', max_length=500)
    reward              = models.FloatField('Reward')
    html                = models.TextField('HTML', max_length=100000000)
    description         = models.TextField('Description', max_length=1000000)
    title               = models.CharField('Title', max_length=500)
    keywords            = models.CharField('Keywords', blank=True, max_length=500, null=True)
    qualifications      = models.CharField('Qualifications', blank=True, max_length=500, null=True)
    occurrence_date     = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
    time_alloted        = models.IntegerField('Time alloted')
    first_crawl         = models.ForeignKey(Crawl, blank=True, null=True)

HitGroupStatus
    
    group_id            = models.CharField('Group ID',max_length=50, db_index=True)
    hits_available      = models.IntegerField('Hits Avaliable')
    page_number         = models.IntegerField('Page Number')
    inpage_position     = models.IntegerField('In Page Position')
    hit_expiration_date = models.DateTimeField('Hit expiration Date')

    hit_group_content   = models.ForeignKey(HitGroupContent)
    
    crawl               = models.ForeignKey(Crawl)        
    '''
        
        items_per_transaction = 1000
        transaction_count = 0
        i = 0
        hit_group_content_mapping = {}
        crawl_mapping = {}
        
        print 'setting up crawl mappings'
        crawls = Crawl.objects.all().values_list('old_id','pk')
        for row in crawls:
            crawl_mapping[row[0]] = row[1]
            
        del crawls
        
        try:
            i = get_counter('import_hitgroups_line')
        except:
            pass
        
        try:
            f = open(os.path.join(settings.ROOT_PATH,'data','hits.utf8.csv'),"rb")
            error_log = open(os.path.join(settings.ROOT_PATH,'data','error.hits.utf8.csv'),'w')
                            
            '''
            seek to file_position stored in counter
            '''
            try:
                last_position = get_counter('import_hitgroups')
                print 'starting from: %s' % last_position
                f.seek(int(last_position))
            except:
                f.readline()
                print 'could not find last position, starting from first line'
                pass
            
            transaction.enter_transaction_management()
            transaction.managed(True)
        
            for row in f:
                try:
                    row = row.strip()
                    group_id, title, requster_name, requester_id, description, keywords, qualifications, hit_expiration_date, time_alloted, reward, hits_available, time_crawled, crawl_id, page_no, inpage_position, dollars =  tuple(row.split('|'))

                    '''                
                    check if there already is a HitGroupContent for this row
                    if HitGroupContent exists, do nothing
                    '''
                    
                    reward = float(reward[1:]) # stripping the leading $, e.g. $0.1
                    time_alloted = parse_time_alloted(time_alloted) # parse strings like "4 hours 30 minutes" into minutes (see the sketch after this example)
                    crawl_id = int(crawl_id)
                    hits_available = int(hits_available)
                    page_no = int(page_no)
                    inpage_position = int(inpage_position)
                    hashed_group_id = False
                    
                    if group_id == '':
                        group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id,
                                                                         time_alloted,reward,
                                                                         description,keywords,
                                                                         qualifications)).hexdigest()
                        hashed_group_id = True
                        

                    hit_expiration_date = datetime.datetime.strptime(re.sub('\(.*\)', '', hit_expiration_date).strip(), "%b %d, %Y") # Apr 5, 2009  (4 weeks 1 day) 
                    
                    exists = False
                    content_id = execute_sql("select id from main_hitgroupcontent where group_id = '%s'" % group_id).fetchone()
                    if content_id is not None:
                        hit_group_content_mapping[group_id] = content_id[0]
                        exists = True
                    
                    if not exists:
                        '''
                        if not: save a new HitGroupContent object and store the mapping in memory
                        '''
                        obj = HitGroupContent(group_id_hashed = hashed_group_id, group_id=group_id, requester_id = requester_id, requester_name = requster_name, reward = reward, description = description, title = title, keywords = keywords, qualifications = qualifications, time_alloted = time_alloted, occurrence_date = time_crawled )
                        obj.save()
                        hit_group_content_mapping[group_id] = obj.pk                       

                    '''
                    store hitgroupstatus into db with correct mapping to HitGroupContent
                    '''
                    obj = HitGroupStatus(group_id = group_id, hits_available = hits_available, page_number = page_no, inpage_position = inpage_position, hit_expiration_date = hit_expiration_date, hit_group_content_id = hit_group_content_mapping[group_id], crawl_id = crawl_mapping[crawl_id])
                    obj.save()
                        
                except (ValueError, KeyError):
                    error_info = grab_error(sys.exc_info())
                    error_log.write(row)
                    error_log.write("\r\n")
                    print 'an error occurred at line %s, %s %s' % (i, error_info['type'], error_info['value'])
                
                i += 1
                
                '''
                do a transaction per items_per_transaction rows
                when commiting transaction write file position and next crawl_id to counter file
                '''
                if i % items_per_transaction == 0:
                    transaction.commit()
                    transaction_count += 1
                    write_counter('import_hitgroups', f.tell())
                    write_counter('import_hitgroups_line', i)
                    print 'committed %s transactions, line: %s' % (transaction_count, i)

        except KeyboardInterrupt:
            '''
            User stopped script, rollback last data, close file descriptors  exit
            '''        
            transaction.rollback()
            error_log.close()
            f.close()
            exit()
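
The import loop above relies on a parse_time_alloted helper which, per the inline comment, turns strings such as "1 day 12 hours" or "4 hours 30 minutes" into a number of minutes. It is not part of the example; a minimal hypothetical sketch under that assumption:

import re

# Hypothetical reconstruction: convert a duration string like "1 day 12 hours"
# into whole minutes.
TIME_UNITS = {'week': 7 * 24 * 60, 'day': 24 * 60, 'hour': 60, 'minute': 1}


def parse_time_alloted(text):
    minutes = 0
    for amount, unit in re.findall(r'(\d+)\s*(week|day|hour|minute)', text):
        minutes += int(amount) * TIME_UNITS[unit]
    return minutes
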
Example #30
def last_crawlids(limit=10):
    r = execute_sql(
        "select crawl_id from hits_mv order by crawl_id desc limit %s", limit)
    return [c[0] for c in r.fetchall()]
Example #31
def last_crawlid():
    return execute_sql(
        "select crawl_id from hits_mv order by crawl_id desc limit 1;"
    ).fetchall()[0][0]