def handle(self, **options):
    try:
        commit_threshold = 10
        # Crawls that have no rows in the hits_mv materialized view yet.
        results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")
        i = 0
        for i, row in enumerate(results):
            execute_sql("""insert into hits_mv
                SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                    (SELECT main_crawl.start_time FROM main_crawl WHERE main_crawl.id = p.crawl_id) AS start_time,
                    q.requester_id, p.hits_available, p.page_number, p.inpage_position,
                    p.hit_expiration_date, q.reward, q.time_alloted
                FROM main_hitgroupstatus p
                JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id
                WHERE p.crawl_id = %s""", row['id'])
            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s crawls' % i
                execute_sql('commit;')
    except Exception:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at row %s: %s %s' % (i, error_info['type'], error_info['value'])
        execute_sql('rollback;')
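# The commands in this section lean on a few small db helpers that are not
# shown here: execute_sql, query_to_dicts and grab_error. A minimal sketch of
# what they could look like on top of Django's raw connection API follows;
# the bodies and exact signatures are assumptions, not the project's actual
# implementation.
from django.db import connection, transaction


def execute_sql(sql, params=None, commit=False):
    # Run raw SQL on the default connection; returns the cursor so callers
    # can fetchone()/fetchall() when they need results.
    cursor = connection.cursor()
    if params is None:
        cursor.execute(sql)
    else:
        if not isinstance(params, (list, tuple)):
            params = [params]
        cursor.execute(sql, params)
    if commit:
        transaction.commit_unless_managed()
    return cursor


def query_to_dicts(sql, params=None):
    # Yield each result row as a {column_name: value} dict.
    cursor = execute_sql(sql, params)
    columns = [col[0] for col in cursor.description]
    for row in cursor.fetchall():
        yield dict(zip(columns, row))


def grab_error(exc_info):
    # Normalize sys.exc_info() into the {'type': ..., 'value': ...} dict
    # the error handlers in this section expect.
    exc_type, exc_value, _ = exc_info
    return {'type': exc_type.__name__, 'value': str(exc_value)}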
def update_crawl_agregates(commit_threshold=10, only_new=True):
    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists(select id from main_crawlagregates where crawl_id = p.id)")
    logging.info("got results")
    for i, row in enumerate(results):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates
                SELECT sum(hits_available) as "hits", start_time,
                       sum(reward * hits_available) as "reward", crawl_id,
                       nextval('main_crawlagregates_id_seq'), count(*) as "count", 0
                FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row['id'])
            # row['id'] comes straight from main_crawl, so the interpolation
            # below is safe here.
            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))
            logging.info("updated agregates for %s" % row['id'])
            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')
        except Exception:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')
    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
def update_crawl_agregates(commit_threshold=1000, start=None, end=None, clear_existing=False):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info("Committed after %s crawls, last id %s." % (number, row_id))

    if clear_existing and start and end:
        clear_existing_rows(start, end)

    i = 0
    for i, row in enumerate(get_crawls(start=start, end=end)):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates
                    (hits, start_time, reward, crawl_id, id, projects, spam_projects)
                SELECT sum(hits_available) as "hits", start_time,
                       sum(reward * hits_available) as "reward", crawl_id,
                       nextval('main_crawlagregates_id_seq'), count(*) as "count",
                       count(CASE WHEN is_spam = TRUE then TRUE ELSE NULL END)
                FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row["id"])
            if i % commit_threshold == 0:
                print_status(i + 1, row["id"])
                transaction.commit_unless_managed()
        except Exception:
            error_info = grab_error(sys.exc_info())
            log.error("an error occurred at crawl_id: %s, %s %s" % (row["id"], error_info["type"], error_info["value"]))
            execute_sql("rollback;")
    if i % commit_threshold != 0:
        print_status(i + 1, row["id"])
        transaction.commit_unless_managed()
    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;", commit=True)
    transaction.commit_unless_managed()
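# This revision delegates crawl selection and cleanup to get_crawls() and
# clear_existing_rows(), which are defined elsewhere. A plausible sketch,
# assuming main_crawl carries a start_time column that the start/end
# datetimes filter on -- treat both bodies as illustrative, not the real
# helpers.
def get_crawls(start=None, end=None):
    # Crawls in the [start, end] window that have no aggregate row yet.
    query = """SELECT id FROM main_crawl p
               WHERE NOT EXISTS (SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)"""
    if start:
        query += " AND p.start_time >= '%s'" % start
    if end:
        query += " AND p.start_time <= '%s'" % end
    return query_to_dicts(query)


def clear_existing_rows(start, end):
    # Drop aggregates in the window so they can be rebuilt from scratch.
    execute_sql("""DELETE FROM main_crawlagregates
                   WHERE crawl_id IN (SELECT id FROM main_crawl
                                      WHERE start_time BETWEEN '%s' AND '%s')""" % (start, end),
                commit=True)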
def handle(self, **options):
    try:
        commit_threshold = 1000
        # Backfill first_crawl_id for content rows that do not have it set yet.
        results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
        i = 0
        for i, row in enumerate(results):
            execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                (select crawl_id from main_hitgroupstatus
                 where group_id = p.group_id order by crawl_id asc LIMIT 1)
                where id = %s""", row['id'])
            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s rows' % i
                execute_sql('commit;')
    except Exception:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at row %s: %s %s' % (i, error_info['type'], error_info['value'])
        execute_sql('rollback;')
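# For reference, the per-row backfill above could in principle collapse into a
# single set-based UPDATE, trading the incremental commits for one
# long-running statement. A sketch under the same schema assumptions; the
# function name is hypothetical and the statement is untested here:
def backfill_first_crawl_ids():
    # One statement instead of one UPDATE per content row.
    execute_sql("""update main_hitgroupcontent p set first_crawl_id =
        (select crawl_id from main_hitgroupstatus
         where group_id = p.group_id order by crawl_id asc LIMIT 1)
        where p.first_crawl_id is null""", commit=True)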
def handle(self, **options):
    '''
    Sample data:

    groupid|hit_title|requester_name|requester_id|description|keywords|qualifications|hit_expiration_date|time_allotted|reward|hits_available|time_crawled|crawl_id|pageNo|inpage_position|dollars
    CW18RQ8BPZWWMWY3DY4Z|Validate the addresses. Great bonus |Product Search|A28JEQTWH76JDT|Given the list of addresses and it's site URL, please verify whether the given addresses exists in the site or not. [ Note: Bonus and Reward Details: 1). For each validated address you will get $0.02 as bonus. Suppose if the input file contains 10 addresses and you have validated all of them then you will get $0.2(i.e., 10*0.02) as bonus + $0.01 as reward. ]|reward, online, shopping, web, quality, testing, relevance, search, engine, US, India, browse, hit |HIT approval rate (%) is not less than 80 |Aug 26, 2009 (33 weeks) |1 day 12 hours|$0.01|295|2009-01-07 14:00:05|1|2|5|0.01

    HitGroupContent:
        group_id            = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
        group_id_hashed     = models.BooleanField(default=False)
        requester_id        = models.CharField('Requester ID', max_length=50, db_index=True)
        requester_name      = models.CharField('Requester Name', max_length=500)
        reward              = models.FloatField('Reward')
        html                = models.TextField('HTML', max_length=100000000)
        description         = models.TextField('Description', max_length=1000000)
        title               = models.CharField('Title', max_length=500)
        keywords            = models.CharField('Keywords', blank=True, max_length=500, null=True)
        qualifications      = models.CharField('Qualifications', blank=True, max_length=500, null=True)
        occurrence_date     = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
        time_alloted        = models.IntegerField('Time alloted')
        first_crawl         = models.ForeignKey(Crawl, blank=True, null=True)

    HitGroupStatus:
        group_id            = models.CharField('Group ID', max_length=50, db_index=True)
        hits_available      = models.IntegerField('Hits Available')
        page_number         = models.IntegerField('Page Number')
        inpage_position     = models.IntegerField('In Page Position')
        hit_expiration_date = models.DateTimeField('Hit expiration Date')
        hit_group_content   = models.ForeignKey(HitGroupContent)
        crawl               = models.ForeignKey(Crawl)
    '''
    items_per_transaction = 1000
    transaction_count = 0
    i = 0
    hit_group_content_mapping = {}
    crawl_mapping = {}

    print 'setting up crawl mappings'
    crawls = Crawl.objects.all().values_list('old_id', 'pk')
    for row in crawls:
        crawl_mapping[row[0]] = row[1]
    del crawls

    try:
        i = get_counter('import_hitgroups_line')
    except Exception:
        pass

    try:
        f = open(os.path.join(settings.ROOT_PATH, 'data', 'hits.utf8.csv'), "rb")
        error_log = open(os.path.join(settings.ROOT_PATH, 'data', 'error.hits.utf8.csv'), 'w')

        # seek to the file position stored in the counter
        try:
            last_position = get_counter('import_hitgroups')
            print 'starting from: %s' % last_position
            f.seek(int(last_position))
        except Exception:
            f.readline()  # skip the header line
            print 'could not find last position, starting from first line'

        transaction.enter_transaction_management()
        transaction.managed(True)

        for row in f:
            try:
                row = row.strip()
                group_id, title, requester_name, requester_id, description, keywords, qualifications, hit_expiration_date, time_alloted, reward, hits_available, time_crawled, crawl_id, page_no, inpage_position, dollars = tuple(row.split('|'))

                reward = float(reward[1:])  # strip the leading $, e.g. $0.1
                time_alloted = parse_time_alloted(time_alloted)  # parse strings like "4 hours 30 minutes" into minutes
                crawl_id = int(crawl_id)
                hits_available = int(hits_available)
                page_no = int(page_no)
                inpage_position = int(inpage_position)

                hashed_group_id = False
                if group_id == '':
                    group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id, time_alloted, reward, description, keywords, qualifications)).hexdigest()
                    hashed_group_id = True

                hit_expiration_date = datetime.datetime.strptime(re.sub('\(.*\)', '', hit_expiration_date).strip(), "%b %d, %Y")  # e.g. "Apr 5, 2009 (4 weeks 1 day)"

                # check if there already is a HitGroupContent for this row;
                # if it exists, reuse its id and do nothing else
                exists = False
                content_id = execute_sql("select id from main_hitgroupcontent where group_id = '%s'" % group_id).fetchone()
                if content_id is not None:
                    hit_group_content_mapping[group_id] = content_id[0]
                    exists = True

                if not exists:
                    # save a new HitGroupContent object and store the mapping in memory
                    obj = HitGroupContent(group_id_hashed=hashed_group_id,
                                          group_id=group_id,
                                          requester_id=requester_id,
                                          requester_name=requester_name,
                                          reward=reward,
                                          description=description,
                                          title=title,
                                          keywords=keywords,
                                          qualifications=qualifications,
                                          time_alloted=time_alloted,
                                          occurrence_date=time_crawled)
                    obj.save()
                    hit_group_content_mapping[group_id] = obj.pk

                # store HitGroupStatus in the db with the correct mapping to HitGroupContent
                obj = HitGroupStatus(group_id=group_id,
                                     hits_available=hits_available,
                                     page_number=page_no,
                                     inpage_position=inpage_position,
                                     hit_expiration_date=hit_expiration_date,
                                     hit_group_content_id=hit_group_content_mapping[group_id],
                                     crawl_id=crawl_mapping[crawl_id])
                obj.save()
            except (ValueError, KeyError):
                error_info = grab_error(sys.exc_info())
                error_log.write(row)
                error_log.write("\r\n")
                print 'an error occurred at line %s: %s %s' % (i, error_info['type'], error_info['value'])

            i += 1

            # commit every items_per_transaction rows; when committing, write
            # the file position and line number to the counter files so the
            # import can resume where it stopped
            if i % items_per_transaction == 0:
                transaction.commit()
                transaction_count += 1
                write_counter('import_hitgroups', f.tell())
                write_counter('import_hitgroups_line', i)
                print 'committed %s transactions, line: %s' % (transaction_count, i)
    except KeyboardInterrupt:
        # user stopped the script: roll back pending data, close file descriptors, exit
        transaction.rollback()
        error_log.close()
        f.close()
        exit()
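# parse_time_alloted(), get_counter() and write_counter() are referenced above
# but defined elsewhere. Minimal sketches under assumed semantics: the parser
# turns strings like "1 day 12 hours" into total minutes, and the counters
# persist resume state in small files under settings.ROOT_PATH. All three are
# illustrations, not the project's code.
import os
import re

from django.conf import settings


def parse_time_alloted(value):
    # "1 day 12 hours" -> 2160 minutes; unrecognized units are ignored.
    minutes_per_unit = {'week': 10080, 'day': 1440, 'hour': 60, 'minute': 1}
    total = 0
    for amount, unit in re.findall(r'(\d+)\s*(week|day|hour|minute)', value):
        total += int(amount) * minutes_per_unit[unit]
    return total


def get_counter(name):
    # Raises IOError/ValueError when no counter has been written yet, which
    # the callers above treat as "start from the beginning".
    with open(os.path.join(settings.ROOT_PATH, 'data', '%s.counter' % name)) as f:
        return int(f.read().strip())


def write_counter(name, value):
    with open(os.path.join(settings.ROOT_PATH, 'data', '%s.counter' % name), 'w') as f:
        f.write(str(value))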
def update_crawl_agregates(commit_threshold=1000, only_new=True):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info('Committed after %s crawls, last id %s.' % (number, row_id))

    query = """SELECT id FROM main_crawl p
        WHERE {0} groups_available * 0.9 < groups_downloaded
        AND NOT exists(SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)"""
    query = query.format('old_id is NULL AND' if only_new else '')
    results = query_to_dicts(query)
    log.info("Fetched crawls to process.")

    i = 0
    for i, row in enumerate(results):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates
                    (hits, start_time, reward, crawl_id, id, projects, spam_projects)
                SELECT sum(hits_available) as "hits", start_time,
                       sum(reward * hits_available) as "reward", crawl_id,
                       nextval('main_crawlagregates_id_seq'), count(*) as "count", 0
                FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row['id'])
            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))
            if i % commit_threshold == 0:
                print_status(i, row['id'])
                execute_sql('commit;')
        except Exception:
            error_info = grab_error(sys.exc_info())
            log.error('an error occurred at crawl_id: %s, %s %s' % (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')
    if i % commit_threshold != 0:
        print_status(i, row['id'])
        execute_sql('commit;')
    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
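# Typical call site: update_crawl_agregates() is written to be driven from a
# Django management command. A hypothetical wrapper (the command class below
# is an assumption, not code from this project):
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Rebuilds main_crawlagregates rows for crawls missing them.'

    def handle(self, **options):
        update_crawl_agregates(commit_threshold=1000, only_new=True)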