def update_first_occured_agregates():
    missing_crawls = query_to_tuples("""select id from main_crawl p
        where p.success = true and not exists
            (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id);""")

    for row in missing_crawls:
        crawl_id = row[0]
        logging.info("inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id)

        execute_sql("""INSERT INTO main_hitgroupfirstoccurences
            select p.reward, p.id, q.crawl_id, p.requester_name, q.id,
                p.occurrence_date, p.requester_id, p.group_id, q.hits_available,
                nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
            from main_hitgroupcontent p
            join main_hitgroupstatus q
                on (p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id)
            where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
def update_diffs(limit=100):
    start_time = time.time()
    execute_sql("""
        UPDATE hits_mv
            SET hits_diff = diffs.hits_diff
        FROM
            (SELECT
                group_id, crawl_id,
                hits_available - COALESCE(lag(hits_available) over
                    (partition BY group_id ORDER BY crawl_id), 0) AS hits_diff
            FROM hits_mv
            WHERE hits_diff is NULL
                AND crawl_id in
                    (SELECT DISTINCT crawl_id FROM hits_mv
                     WHERE hits_diff is NULL ORDER BY crawl_id DESC LIMIT %s)
            ) AS diffs
        WHERE (diffs.group_id = hits_mv.group_id)
            AND (diffs.crawl_id = hits_mv.crawl_id);""", (int(limit), ))

    execute_sql('commit;')

    log.debug('Updated diffs for %s crawls in %s', limit, time.time() - start_time)
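# For reference, a pure-Python sketch of what the window query above computes:
# for each group_id, ordered by crawl_id, hits_diff is the change in
# hits_available since the previous crawl (0 when there is no previous row).
# The tuple shape is an assumption made for illustration, not the real schema.
def compute_hits_diffs(rows):
    """rows: iterable of (group_id, crawl_id, hits_available) tuples."""
    prev = {}
    diffs = {}
    for group_id, crawl_id, hits_available in sorted(rows):
        diffs[(group_id, crawl_id)] = hits_available - prev.get(group_id, 0)
        prev[group_id] = hits_available
    return diffs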
def updatehitgroup(g, cid):
    prev = execute_sql("""select hits_available from hits_mv
        where crawl_id between %s and %s and group_id = '%s'
        order by crawl_id desc limit 1;""" % (cid - 100, cid - 1, g)).fetchall()
    prev = prev[0][0] if prev else 0

    execute_sql("""update hits_mv set hits_diff = hits_available - %s
        where group_id = '%s' and crawl_id = %s;""" % (prev, g, cid))
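# updatehitgroup interpolates group_id straight into the SQL string, which
# breaks (or worse) if a group_id ever contains a quote. A safer variant,
# assuming execute_sql forwards a parameter tuple to the driver the way the
# update_diffs call above does:
def updatehitgroup_safe(g, cid):
    prev = execute_sql("""select hits_available from hits_mv
        where crawl_id between %s and %s and group_id = %s
        order by crawl_id desc limit 1;""", (cid - 100, cid - 1, g)).fetchall()
    prev = prev[0][0] if prev else 0

    execute_sql("""update hits_mv set hits_diff = hits_available - %s
        where group_id = %s and crawl_id = %s;""", (prev, g, cid))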
def update_cid(cid):
    st = time.time()
    count = 0
    for i, g in enumerate(query_to_tuples(
            "select distinct group_id from hits_mv where crawl_id = %s", cid)):
        g = g[0]
        log.info("processing %s, %s %s", i, cid, g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")
    log.info("updated crawl in %s", time.time() - st)
    return count
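# Hypothetical glue code showing how these helpers compose: recompute the
# per-group diffs for the most recent crawls present in hits_mv. Not part of
# the original module; the function name is made up.
def refresh_recent_diffs(crawls=10):
    total = 0
    for cid in last_crawlids(limit=crawls):
        total += update_cid(cid)
    log.info("refreshed diffs for %s groups across %s crawls", total, crawls)
    return total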
def topreq_data(days):
    from tenclouds.sql import query_to_tuples, execute_sql

    firstcrawl = execute_sql("""
        SELECT crawl_id FROM hits_mv
        WHERE start_time > %s
        ORDER BY start_time ASC LIMIT 1;""",
        datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    return list(query_to_tuples("""
        SELECT h.requester_id, h.requester_name,
            count(*) as "projects",
            sum(mv.hits_available) as "hits",
            sum(mv.hits_available * h.reward) as "reward",
            max(h.occurrence_date) as "last_posted"
        FROM main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT group_id, crawl_id, hits_available FROM hits_mv
                WHERE crawl_id > %s
            ) mv ON (h.group_id = mv.group_id AND h.first_crawl_id = mv.crawl_id)
        WHERE h.first_crawl_id > %s
            AND coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY sum(mv.hits_available * h.reward) DESC
        LIMIT 1000;""" % (firstcrawl, firstcrawl)))
def handle(self, **options):
    update_crawl_agregates(only_new=False)

    f = open(os.path.join(settings.ROOT_PATH, 'crawl.errors.csv'), "rb")
    progress = 10

    execute_sql("update main_crawl set success = true where old_id is not null")

    for i, id in enumerate(f):
        id = id.strip()
        execute_sql("delete from main_crawlagregates where crawl_id = "
                    "(select id from main_crawl where old_id = %s)" % id)
        execute_sql("update main_crawl set success = false where old_id = %s" % id)
        if i % progress == 0:
            print "processed %s rows" % i
            execute_sql("commit;")

    execute_sql("commit;")
def handle(self, **options):
    try:
        commit_threshold = 10
        results = query_to_dicts("select id from main_crawl p where not exists"
                                 "(select id from hits_mv where crawl_id = p.id)")

        for i, row in enumerate(results):
            execute_sql("""insert into hits_mv
                SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                    (SELECT main_crawl.start_time FROM main_crawl
                     WHERE main_crawl.id = p.crawl_id) AS start_time,
                    q.requester_id, p.hits_available, p.page_number,
                    p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted
                FROM main_hitgroupstatus p
                JOIN main_hitgroupcontent q
                    ON q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id
                WHERE p.crawl_id = %s""" % row['id'])

            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s crawls' % i
                execute_sql('commit;')
    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at: %s line, %s %s' % (i, error_info['type'],
                                                        error_info['value'])
        execute_sql('rollback;')
def update_mviews():
    missing_crawls = query_to_tuples("""select id, start_time from main_crawl p
        where p.success = true
            and not exists (select id from main_crawlagregates where crawl_id = p.id)
            and old_id is null
            and groups_downloaded > 200
            and has_hits_mv = false
        order by id desc""")

    for row in missing_crawls:
        crawl_id, start_time = row
        logging.info("inserting missing crawl: %s" % crawl_id)

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)

        execute_sql("""INSERT INTO hits_mv
            SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
                TIMESTAMP '%s', q.requester_id, p.hits_available, p.page_number,
                p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted,
                null, q.is_spam
            FROM main_hitgroupstatus p
            JOIN main_hitgroupcontent q
                ON (q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id)
            WHERE p.crawl_id = %s;""" % (start_time, crawl_id))

        execute_sql("update main_crawl set has_hits_mv = true where id = %s" % crawl_id)
        execute_sql('commit;')
def calculate_first_crawl_id():
    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent "
                             "where first_crawl_id is null")
    logging.info('got missing ids results')
    for i, r in enumerate(results):
        logging.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus
             where hit_group_content_id = p.id)
            where id = %s""" % r['id'])
        if i % progress == 0:
            execute_sql('commit;')
            logging.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)

    execute_sql('commit;')
def clean_duplicates():
    ids = query_to_dicts("select group_id from main_hitgroupcontent "
                         "group by group_id having count(*) > 1;")

    for id in ids:
        print "deleting %s" % id['group_id']
        logging.info("deleting %s" % id['group_id'])

        # remove the dependent status rows first, then the duplicate content
        # rows, keeping the content row with the lowest id
        execute_sql("""delete from main_hitgroupstatus where
            hit_group_content_id in (
                select id from main_hitgroupcontent where id !=
                    (select min(id) from main_hitgroupcontent where group_id = '%s')
                and group_id = '%s');""" % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where id !=
            (select min(id) from main_hitgroupcontent where group_id = '%s')
            and group_id = '%s'""" % (id['group_id'], id['group_id']))

        execute_sql('commit;')
def handle(self, **options):
    try:
        commit_threshold = 1000
        results = query_to_dicts("select id from main_hitgroupcontent "
                                 "where first_crawl_id is null")

        for i, row in enumerate(results):
            execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                (select crawl_id from main_hitgroupstatus
                 where group_id = p.group_id order by crawl_id asc LIMIT 1)
                where id = %s""" % row['id'])

            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s rows' % i
                execute_sql('commit;')
    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at: %s line, %s %s' % (i, error_info['type'],
                                                        error_info['value'])
        execute_sql('rollback;')
def last_crawlids(limit=10): r = execute_sql("select crawl_id from hits_mv order by crawl_id desc limit %s", limit) return [c[0] for c in r.fetchall()]
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id,row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get(id= row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict(body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed=True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id) ) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list( Crawl.objects.filter(is_spam_computed=False).order_by( '-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts( """select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id, row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get( id=row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict( body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql( "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed = True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id)) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def last_crawlid(): return execute_sql("select crawl_id from hits_mv order by crawl_id desc limit 1;").fetchall()[0][0]
def update_crawl_agregates(commit_threshold=10, only_new=True):
    if only_new:
        results = query_to_dicts("select id from main_crawl p where old_id is null "
                                 "and not exists(select id from main_crawlagregates where crawl_id = p.id)")
    else:
        results = query_to_dicts("select id from main_crawl p where not exists"
                                 "(select id from main_crawlagregates where crawl_id = p.id)")

    logging.info("got results")

    for i, row in enumerate(results):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates
                SELECT
                    sum(hits_available) as "hits",
                    start_time,
                    sum(reward * hits_available) as "reward",
                    crawl_id,
                    nextval('main_crawlagregates_id_seq'),
                    count(*) as "count",
                    0
                FROM
                    (SELECT DISTINCT ON (group_id) * FROM hits_mv WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row['id'])

            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))

            logging.info("updated aggregates for %s" % row['id'])

            if i % commit_threshold == 0:
                logging.info('committed after %s crawls' % i)
                execute_sql('commit;')
        except:
            error_info = grab_error(sys.exc_info())
            logging.error('an error occurred at crawl_id: %s, %s %s' %
                          (row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")
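# DISTINCT ON (group_id) in the insert above keeps one row per group_id; with
# no ORDER BY inside the subquery, which row wins is unspecified. Roughly this,
# as a Python sketch (dict-shaped rows are an assumption for illustration):
def distinct_on_group_id(rows):
    seen = {}
    for row in rows:
        seen.setdefault(row['group_id'], row)
    return list(seen.values())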
def hitgroups(cid): r = execute_sql("select distinct group_id from hits_mv where crawl_id = %s", cid) return [g[0] for g in r.fetchall()]
def handle(self, **options):
    '''
    Sample data:

    groupid|hit_title|requester_name|requester_id|description|keywords|qualifications|hit_expiration_date|time_allotted|reward|hits_available|time_crawled|crawl_id|pageNo|inpage_position|dollars
    CW18RQ8BPZWWMWY3DY4Z|Validate the addresses. Great bonus |Product Search|A28JEQTWH76JDT|Given the list of addresses and it's site URL, please verify whether the given addresses exists in the site or not. [ Note: Bonus and Reward Details: 1). For each validated address you will get $0.02 as bonus. Suppose if the input file contains 10 addresses and you have validated all of them then you will get $0.2(i.e., 10*0.02) as bonus + $0.01 as reward. ]|reward, online, shopping, web, quality, testing, relevance, search, engine, US, India, browse, hit |HIT approval rate (%) is not less than 80 |Aug 26, 2009 (33 weeks) |1 day 12 hours|$0.01|295|2009-01-07 14:00:05|1|2|5|0.01

    HitGroupContent:
        group_id = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
        group_id_hashed = models.BooleanField(default=False)
        requester_id = models.CharField('Requester ID', max_length=50, db_index=True)
        requester_name = models.CharField('Requester Name', max_length=500)
        reward = models.FloatField('Reward')
        html = models.TextField('HTML', max_length=100000000)
        description = models.TextField('Description', max_length=1000000)
        title = models.CharField('Title', max_length=500)
        keywords = models.CharField('Keywords', blank=True, max_length=500, null=True)
        qualifications = models.CharField('Qualifications', blank=True, max_length=500, null=True)
        occurrence_date = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
        time_alloted = models.IntegerField('Time alloted')
        first_crawl = models.ForeignKey(Crawl, blank=True, null=True)

    HitGroupStatus:
        group_id = models.CharField('Group ID', max_length=50, db_index=True)
        hits_available = models.IntegerField('Hits Available')
        page_number = models.IntegerField('Page Number')
        inpage_position = models.IntegerField('In Page Position')
        hit_expiration_date = models.DateTimeField('Hit expiration Date')
        hit_group_content = models.ForeignKey(HitGroupContent)
        crawl = models.ForeignKey(Crawl)
    '''
    items_per_transaction = 1000
    transaction_count = 0
    i = 0
    hit_group_content_mapping = {}
    crawl_mapping = {}

    print 'setting up crawl mappings'
    crawls = Crawl.objects.all().values_list('old_id', 'pk')
    for row in crawls:
        crawl_mapping[row[0]] = row[1]
    del crawls

    try:
        i = get_counter('import_hitgroups_line')
    except:
        pass

    try:
        f = open(os.path.join(settings.ROOT_PATH, 'data', 'hits.utf8.csv'), "rb")
        error_log = open(os.path.join(settings.ROOT_PATH, 'data', 'error.hits.utf8.csv'), 'w')

        # seek to the file position stored in the counter
        try:
            last_position = get_counter('import_hitgroups')
            print 'starting from: %s' % last_position
            f.seek(int(last_position))
        except:
            f.readline()
            print 'could not find last position, starting from first line'

        transaction.enter_transaction_management()
        transaction.managed(True)

        for row in f:
            try:
                row = row.strip()
                (group_id, title, requester_name, requester_id, description,
                 keywords, qualifications, hit_expiration_date, time_alloted,
                 reward, hits_available, time_crawled, crawl_id, page_no,
                 inpage_position, dollars) = tuple(row.split('|'))

                # if a HitGroupContent already exists for this row, do nothing
                reward = float(reward[1:])  # stripping leading $, e.g. $0.1
                time_alloted = parse_time_alloted(time_alloted)  # parses strings like "4 hours 30 minutes" to minutes
                crawl_id = int(crawl_id)
                hits_available = int(hits_available)
                page_no = int(page_no)
                inpage_position = int(inpage_position)

                hashed_group_id = False
                if group_id == '':
                    group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (
                        title, requester_id, time_alloted, reward,
                        description, keywords, qualifications)).hexdigest()
                    hashed_group_id = True

                # e.g. "Apr 5, 2009 (4 weeks 1 day)"
                hit_expiration_date = datetime.datetime.strptime(
                    re.sub('\(.*\)', '', hit_expiration_date).strip(), "%b %d, %Y")

                exists = False
                content_id = execute_sql("select id from main_hitgroupcontent where group_id = '%s'" % group_id).fetchone()
                if content_id is not None:
                    hit_group_content_mapping[group_id] = content_id[0]
                    exists = True

                if not exists:
                    # save a new HitGroupContent object and keep the mapping in memory
                    obj = HitGroupContent(group_id_hashed=hashed_group_id,
                                          group_id=group_id,
                                          requester_id=requester_id,
                                          requester_name=requester_name,
                                          reward=reward,
                                          description=description,
                                          title=title,
                                          keywords=keywords,
                                          qualifications=qualifications,
                                          time_alloted=time_alloted,
                                          occurrence_date=time_crawled)
                    obj.save()
                    hit_group_content_mapping[group_id] = obj.pk

                # store the HitGroupStatus in the db with the correct mapping
                # to HitGroupContent
                obj = HitGroupStatus(group_id=group_id,
                                     hits_available=hits_available,
                                     page_number=page_no,
                                     inpage_position=inpage_position,
                                     hit_expiration_date=hit_expiration_date,
                                     hit_group_content_id=hit_group_content_mapping[group_id],
                                     crawl_id=crawl_mapping[crawl_id])
                obj.save()
            except (ValueError, KeyError):
                error_info = grab_error(sys.exc_info())
                error_log.write(row)
                error_log.write("\r\n")
                print 'an error occurred at: %s line, %s %s' % (i, error_info['type'], error_info['value'])

            i += 1

            # commit a transaction every items_per_transaction rows; on commit,
            # write the file position and line number to the counter files
            if i % items_per_transaction == 0:
                transaction.commit()
                transaction_count += 1
                write_counter('import_hitgroups', f.tell())
                write_counter('import_hitgroups_line', i)
                print 'committed %s transaction, line: %s' % (transaction_count, i)
    except KeyboardInterrupt:
        # user stopped the script: roll back pending data, close file
        # descriptors, exit
        transaction.rollback()
        error_log.close()
        f.close()
        exit()
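# parse_time_alloted is called in the import loop above but not defined in
# this module. A plausible sketch matching the inline comment (strings like
# "4 hours 30 minutes" or "1 day 12 hours" -> total minutes); the real helper
# may differ.
def parse_time_alloted(s):
    units = {'week': 7 * 24 * 60, 'day': 24 * 60, 'hour': 60, 'minute': 1}
    total = 0
    for amount, unit in re.findall(r'(\d+)\s*(week|day|hour|minute)', s):
        total += int(amount) * units[unit]
    return total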