def handle(self, **options):
    i = 0
    try:
        commit_threshold = 10
        results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")

        for i, row in enumerate(results):
            execute_sql("""insert into hits_mv
                SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                       (SELECT main_crawl.start_time
                          FROM main_crawl
                         WHERE main_crawl.id = p.crawl_id) AS start_time,
                       q.requester_id, p.hits_available, p.page_number, p.inpage_position,
                       p.hit_expiration_date, q.reward, q.time_alloted
                  FROM main_hitgroupstatus p
                  JOIN main_hitgroupcontent q
                    ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id
                 WHERE p.crawl_id = %s
            """ % row['id'])

            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s crawls' % i
                execute_sql('commit;')

    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at crawl %s: %s %s' % (i, error_info['type'], error_info['value'])
        execute_sql('rollback;')
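# Every snippet in this listing leans on a few project helpers that are not
# shown (query_to_dicts, execute_sql, grab_error). A minimal sketch of what
# they might look like, assuming raw cursors on Django's default connection;
# the project's real implementations may differ:

from django.db import connection

def execute_sql(sql, *params):
    """Run raw SQL, optionally parameterized, and return the open cursor."""
    cursor = connection.cursor()
    cursor.execute(sql, params or None)
    return cursor

def query_to_dicts(sql, *params):
    """Yield each result row as a dict keyed by column name."""
    cursor = execute_sql(sql, *params)
    columns = [col[0] for col in cursor.description]
    for values in cursor.fetchall():
        yield dict(zip(columns, values))

def grab_error(exc_info):
    """Reduce a sys.exc_info() triple to a small dict for log messages."""
    exc_type, exc_value, _ = exc_info
    return {'type': exc_type.__name__, 'value': str(exc_value)}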
# Example 2
def update_crawl_agregates(commit_threshold=10, only_new=True):

    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)")

    logging.info("got results")

    for i, row in enumerate(results):
        try:

            execute_sql("""
            INSERT INTO
                main_crawlagregates
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s""" % (row['id'], row['id']) ) 

            print """UPDATE main_crawlagregates 
                set spam_projects = 
                    ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                where crawl_id = %s"""


            logging.info("update agregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info( 'commited after %s crawls' % i )
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')


    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
def update_crawl_agregates(commit_threshold=1000, start=None, end=None, clear_existing=False):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info("Committed after %s crawls, last id %s." % (number, row_id))

    if clear_existing and start and end:
        clear_existing_rows(start, end)

    i = 0
    for i, row in enumerate(get_crawls(start=start, end=end)):
        try:
            execute_sql(
                """
            INSERT INTO
                main_crawlagregates (hits, start_time, reward, crawl_id, id,
                    projects, spam_projects)
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                count(CASE WHEN is_spam = TRUE then TRUE ELSE NULL END)
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv
                WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """,
                row["id"],
            )

            if i % commit_threshold == 0:
                print_status(i + 1, row["id"])
                transaction.commit_unless_managed()

        except:
            error_info = grab_error(sys.exc_info())
            log.error("an error occurred at crawl_id: %s, %s %s" % (row["id"], error_info["type"], error_info["value"]))
            execute_sql("rollback;")

    if i % commit_threshold != 0:
        print_status(i + 1, row["id"])
        transaction.commit_unless_managed()

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;", commit=True)
    transaction.commit_unless_managed()
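# The variant above also calls get_crawls and clear_existing_rows, neither of
# which appears in this listing. A plausible sketch, assuming main_crawl has a
# start_time column and that "clearing" means deleting aggregates already
# built for crawls in the window so they can be rebuilt:

def get_crawls(start=None, end=None):
    """Yield crawls that still lack aggregates, optionally within a time window."""
    sql = ("SELECT id FROM main_crawl p WHERE NOT EXISTS "
           "(SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)")
    params = []
    if start is not None:
        sql += " AND p.start_time >= %s"
        params.append(start)
    if end is not None:
        sql += " AND p.start_time <= %s"
        params.append(end)
    return query_to_dicts(sql, *params)

def clear_existing_rows(start, end):
    """Delete aggregates for crawls whose start_time falls within [start, end]."""
    execute_sql("DELETE FROM main_crawlagregates WHERE crawl_id IN "
                "(SELECT id FROM main_crawl WHERE start_time BETWEEN %s AND %s)",
                start, end)
    execute_sql("COMMIT;")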
    def handle(self, **options):
        i = 0
        try:
            commit_threshold = 1000
            results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")

            for i, row in enumerate(results):
                execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                    (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1)
                    where
                        id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'committed after %s rows' % i
                    execute_sql('commit;')
        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occurred at row %s: %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
# Example 6
    def handle(self, **options):
        '''
        Sample data:
groupid|hit_title|requester_name|requester_id|description|keywords|qualifications|hit_expiration_date|time_allotted|reward|hits_available|time_crawled|crawl_id|pageNo|inpage_position|dollars
CW18RQ8BPZWWMWY3DY4Z|Validate the addresses. Great bonus |Product Search|A28JEQTWH76JDT|Given the list of addresses and it's site URL, please verify whether the given addresses exists in the site or not. [ Note: Bonus and Reward Details: 1). For each validated address you will get $0.02 as bonus. Suppose if the input file contains 10 addresses and you have validated all of them then you will get $0.2(i.e., 10*0.02) as bonus + $0.01 as reward. ]|reward,  online,  shopping,  web,  quality,  testing,  relevance,  search,  engine,  US,  India,  browse,  hit |HIT approval rate (%) is not less than 80 |Aug 26, 2009  (33 weeks) |1 day 12 hours|$0.01|295|2009-01-07 14:00:05|1|2|5|0.01

HitGroupContent:
    group_id            = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
    group_id_hashed     = models.BooleanField(default=False)
    requester_id        = models.CharField('Requester ID', max_length=50, db_index=True)
    requester_name      = models.CharField('Requester Name', max_length=500)
    reward              = models.FloatField('Reward')
    html                = models.TextField('HTML', max_length=100000000)
    description         = models.TextField('Description', max_length=1000000)
    title               = models.CharField('Title', max_length=500)
    keywords            = models.CharField('Keywords', blank=True, max_length=500, null=True)
    qualifications      = models.CharField('Qualifications', blank=True, max_length=500, null=True)
    occurrence_date     = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
    time_alloted        = models.IntegerField('Time alloted')
    first_crawl         = models.ForeignKey(Crawl, blank=True, null=True)

HitGroupStatus
    
    group_id            = models.CharField('Group ID',max_length=50, db_index=True)
    hits_available      = models.IntegerField('Hits Avaliable')
    page_number         = models.IntegerField('Page Number')
    inpage_position     = models.IntegerField('In Page Position')
    hit_expiration_date = models.DateTimeField('Hit expiration Date')

    hit_group_content   = models.ForeignKey(HitGroupContent)
    
    crawl               = models.ForeignKey(Crawl)        
    '''
        
        items_per_transaction = 1000
        transaction_count = 0
        i = 0
        hit_group_content_mapping = {}
        crawl_mapping = {}
        
        print 'setting up crawl mappings'
        crawls = Crawl.objects.all().values_list('old_id','pk')
        for row in crawls:
            crawl_mapping[row[0]] = row[1]
            
        del crawls
        
        try:
            i = get_counter('import_hitgroups_line')
        except:
            pass
        
        try:
            f = open(os.path.join(settings.ROOT_PATH,'data','hits.utf8.csv'),"rb")
            error_log = open(os.path.join(settings.ROOT_PATH,'data','error.hits.utf8.csv'),'w')
                            
            '''
            seek to file_position stored in counter
            '''
            try:
                last_position = get_counter('import_hitgroups')
                print 'starting from: %s' % last_position
                f.seek(int(last_position))
            except:
                f.readline()
                print 'could not find last position, starting from the first line'
            
            transaction.enter_transaction_management()
            transaction.managed(True)
        
            for row in f:
                try:
                    row = row.strip()
                    (group_id, title, requester_name, requester_id, description,
                     keywords, qualifications, hit_expiration_date, time_alloted,
                     reward, hits_available, time_crawled, crawl_id, page_no,
                     inpage_position, dollars) = row.split('|')

                    '''
                    check if there already is a HitGroupContent for this row;
                    if one exists, reuse its id and skip the insert
                    '''
                    
                    reward = float(reward[1:]) # strip the leading $, e.g. $0.01
                    time_alloted = parse_time_alloted(time_alloted) # parse strings like "4 hours 30 minutes" into minutes
                    crawl_id = int(crawl_id)
                    hits_available = int(hits_available)
                    page_no = int(page_no)
                    inpage_position = int(inpage_position)
                    hashed_group_id = False
                    
                    if group_id == '':
                        group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id,
                                                                         time_alloted,reward,
                                                                         description,keywords,
                                                                         qualifications)).hexdigest()
                        hashed_group_id = True
                        

                    hit_expiration_date = datetime.datetime.strptime(re.sub(r'\(.*\)', '', hit_expiration_date).strip(), "%b %d, %Y") # e.g. "Apr 5, 2009  (4 weeks 1 day)"
                    
                    exists = False
                    content_id = execute_sql("select id from main_hitgroupcontent where group_id = %s", group_id).fetchone()
                    if content_id is not None:
                        hit_group_content_mapping[group_id] = content_id[0]
                        exists = True
                    
                    if not exists:
                        '''
                        otherwise save a new HitGroupContent object and store the mapping in memory
                        '''
                        obj = HitGroupContent(group_id_hashed=hashed_group_id, group_id=group_id,
                                              requester_id=requester_id, requester_name=requester_name,
                                              reward=reward, description=description, title=title,
                                              keywords=keywords, qualifications=qualifications,
                                              time_alloted=time_alloted, occurrence_date=time_crawled)
                        obj.save()
                        hit_group_content_mapping[group_id] = obj.pk

                    '''
                    store a HitGroupStatus row with the correct mapping to HitGroupContent
                    '''
                    obj = HitGroupStatus(group_id=group_id, hits_available=hits_available,
                                         page_number=page_no, inpage_position=inpage_position,
                                         hit_expiration_date=hit_expiration_date,
                                         hit_group_content_id=hit_group_content_mapping[group_id],
                                         crawl_id=crawl_mapping[crawl_id])
                    obj.save()
                        
                except (ValueError, KeyError):
                    error_info = grab_error(sys.exc_info())
                    error_log.write(row)
                    error_log.write("\r\n")
                    print 'an error occurred at line %s: %s %s' % (i, error_info['type'], error_info['value'])
                
                i += 1
                
                '''
                commit once per items_per_transaction rows;
                when committing, write the file position and line number to the counter file
                '''
                if i % items_per_transaction == 0:
                    transaction.commit()
                    transaction_count += 1
                    write_counter('import_hitgroups', f.tell())
                    write_counter('import_hitgroups_line', i)
                    print 'committed transaction %s, line: %s' % (transaction_count, i)

        except KeyboardInterrupt:
            '''
            user stopped the script: roll back pending data, close file descriptors, and exit
            '''
            transaction.rollback()
            error_log.close()
            f.close()
            exit()
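# Two more helpers the importer above relies on are parse_time_alloted and the
# get_counter/write_counter pair. parse_time_alloted turns strings such as
# "1 day 12 hours" or "4 hours 30 minutes" (see the sample row in the
# docstring) into a total number of minutes; the counters persist progress so
# an interrupted import can resume. Minimal sketches; the counter file layout
# is an assumption:

import os
import re

from django.conf import settings

TIME_UNIT_MINUTES = {'week': 7 * 24 * 60, 'day': 24 * 60, 'hour': 60, 'minute': 1}

def parse_time_alloted(value):
    """Return the number of minutes encoded in e.g. '1 day 12 hours'."""
    minutes = 0
    for amount, unit in re.findall(r'(\d+)\s*(week|day|hour|minute)', value):
        minutes += int(amount) * TIME_UNIT_MINUTES[unit]
    return minutes

def _counter_path(name):
    # Hypothetical location; the project may store counters elsewhere.
    return os.path.join(settings.ROOT_PATH, 'data', '%s.counter' % name)

def get_counter(name):
    """Read a persisted integer counter; raises if it was never written."""
    with open(_counter_path(name)) as fd:
        return int(fd.read().strip())

def write_counter(name, value):
    """Persist a counter value (file offset or line number)."""
    with open(_counter_path(name), 'w') as fd:
        fd.write(str(value))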
# Example 8
def update_crawl_agregates(commit_threshold=1000, only_new=True):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info('Committed after %s crawls, last id %s.' % (number, row_id))

    query = """SELECT id FROM main_crawl p WHERE {0}
        groups_available * 0.9 < groups_downloaded AND
        NOT exists(SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)
    """
    query = query.format('old_id is NULL AND' if only_new else '')
    results = query_to_dicts(query)

    log.info("Fetched crawls to process.")

    i = 0
    for i, row in enumerate(results):
        try:
            execute_sql("""
            INSERT INTO
                main_crawlagregates (hits, start_time, reward, crawl_id, id,
                    projects, spam_projects)
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv
                WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv
                    where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))

            if i % commit_threshold == 0:
                print_status(i + 1, row['id'])
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            log.error('an error occurred at crawl_id: %s, %s %s' % (
                row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    if i % commit_threshold != 0:
        print_status(i + 1, row['id'])
        execute_sql('commit;')

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")