async def submit(self, ctx, paste_url):
    errors = "No Errors"
    sqlLabelID = "SELECT ID FROM `labels_submitted` WHERE Label = %s"
    sqlPlayerID = "SELECT ID FROM `players_submitted` WHERE Players = %s"
    sqlLabelInsert = "INSERT IGNORE `labels_submitted`(`Label`) VALUES (%s)"
    sqlPlayersInsert = "INSERT IGNORE `players_submitted`(`Players`) VALUES (%s)"
    sqlInsertPlayerLabel = ("INSERT IGNORE `playerlabels_submitted`"
                            "(`Player_ID`, `Label_ID`) VALUES (%s, %s)")
    try:
        paste_soup = sql.get_paste_data(paste_url)
        List = sql.get_paste_names(paste_soup)
        labelCheck = sql.get_paste_label(paste_soup)
        sql.execute_sql(sqlLabelInsert, insert=True, param=[labelCheck])
        sql.InsertPlayers(sqlPlayersInsert, List)
        dfLabelID = pd.DataFrame(
            sql.execute_sql(sqlLabelID, insert=False, param=[labelCheck]))
        playerID = sql.PlayerID(sqlPlayerID, List)
        sql.InsertPlayerLabel(sqlInsertPlayerLabel, playerID, dfLabelID)
        await ctx.reply("Your list has been received. Thank you!")
    except Exception as e:
        errors = str(e)
        await ctx.reply("There was an error parsing your list submission.")

    msg = cleandoc(f"""```diff
        Paste Information Submitted
        _____________________
        Link: {paste_url}
        Errors: {errors}
        ```""")
    recipient = self.bot.get_user(int(os.getenv('SUBMIT_RECIPIENT')))
    await recipient.send(msg)
def create_hits_mv_record(start_time, crawl_id):
    """Aggregates main_hitgroupstatus/main_hitgroupcontent data into hits_mv
    and marks the crawl as already processed.
    """
    execute_sql("""
        INSERT INTO hits_mv (status_id, content_id, group_id, crawl_id,
            start_time, requester_id, hits_available, page_number,
            inpage_position, hit_expiration_date, reward, time_alloted,
            hits_diff, is_spam)
        SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id,
            TIMESTAMP '{start_time}', q.requester_id, p.hits_available,
            p.page_number, p.inpage_position, p.hit_expiration_date,
            q.reward, q.time_alloted, null, q.is_spam
        FROM main_hitgroupstatus p
        JOIN main_hitgroupcontent q
            ON (q.group_id::text = p.group_id::text
                AND p.hit_group_content_id = q.id)
        WHERE p.crawl_id = {crawl_id};
        """.format(start_time=start_time, crawl_id=crawl_id), commit=True)

    execute_sql("UPDATE main_crawl SET has_hits_mv = true WHERE id = {0};".format(
        crawl_id), commit=True)

    transaction.commit_unless_managed()
def delete_crawl_agregates(self, ids):
    """This will be done separately, as it's a much faster query with an
    immediately visible effect.
    """
    qq = self.__get_delete_queries(['main_crawlagregates'], 'in').next()
    execute_sql(qq.format(self.__chunk_str(ids)))
    transaction.commit_unless_managed()
def update_mviews(clear_existing=True, force=False, start=None, end=None):
    """Creates hits_mv records for crawls with enough groups_downloaded,
    taking data from the main_hitgroupstatus and main_hitgroupcontent tables.

    Filtering out incomplete anonymous crawls
    -----------------------------------------
    Note: this is covered by the latter case.

    There is a 20 page limit for anonymous mturk.com users, each page having
    10 hitgroups, while the usual number of hitgroups is at least 100. Thus
    all crawls with 200 or fewer hitgroups should be excluded.

    Filtering erroneous crawls
    --------------------------
    Usually, a difference between hitgroups_downloaded and
    hitgroups_available exceeding 10 percent denotes a crawl error. Such
    crawls should be excluded from creating hits_mv and further table records.
    """
    for crawl_id, start_time in get_crawls_for_update(force, start, end):
        if clear_existing:
            log.info("Deleting hits_mv records for: {0}".format(crawl_id))
            execute_sql("DELETE FROM hits_mv WHERE crawl_id = {0}".format(
                crawl_id), commit=True)
        log.info("Creating hits_mv records for: {0}.".format(crawl_id))
        create_hits_mv_record(start_time, crawl_id)
def update_diffs(limit=100):
    start_time = time.time()
    execute_sql("""
        UPDATE hits_mv SET hits_diff = diffs.hits_diff
        FROM (
            SELECT group_id, crawl_id,
                hits_available - COALESCE(lag(hits_available) over
                    (partition BY group_id ORDER BY crawl_id), 0) AS hits_diff
            FROM hits_mv
            WHERE hits_diff is NULL
                AND crawl_id in (
                    SELECT DISTINCT crawl_id FROM hits_mv
                    WHERE hits_diff is NULL
                    ORDER BY crawl_id DESC LIMIT %s)
        ) AS diffs
        WHERE (diffs.group_id = hits_mv.group_id)
            AND (diffs.crawl_id = hits_mv.crawl_id);""", (int(limit), ))

    execute_sql('commit;')

    log.debug('Updated diffs for %s crawls in %ss.', limit,
              time.time() - start_time)
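# Illustration only (not part of the project): a minimal, self-contained sketch
# of the lag()-based diff technique that update_diffs() relies on, run against
# an in-memory SQLite database. Table and column names mirror hits_mv but the
# data is made up; requires a Python build with SQLite >= 3.25 for window
# functions.
def _hits_diff_demo():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    conn.executescript("""
        CREATE TABLE hits_mv (group_id TEXT, crawl_id INTEGER,
                              hits_available INTEGER);
        INSERT INTO hits_mv VALUES ('g1', 1, 100), ('g1', 2, 80), ('g1', 3, 90);
    """)
    rows = conn.execute("""
        SELECT group_id, crawl_id,
               hits_available - COALESCE(lag(hits_available) OVER
                   (PARTITION BY group_id ORDER BY crawl_id), 0) AS hits_diff
        FROM hits_mv ORDER BY crawl_id;""").fetchall()
    # Expected: [('g1', 1, 100), ('g1', 2, -20), ('g1', 3, 10)]
    return rows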
def update_first_occured_agregates():
    missing_crawls = query_to_tuples("""
        select id from main_crawl p
        where p.success = true
            and not exists (select crawl_id from main_hitgroupfirstoccurences
                where crawl_id = p.id);""")

    for row in missing_crawls:
        crawl_id = row[0]
        log.info("inserting missing crawl into "
                 "main_hitgroupfirstoccurences: %s" % crawl_id)
        execute_sql("""INSERT INTO main_hitgroupfirstoccurences (reward,
                group_content_id, crawl_id, requester_name, group_status_id,
                occurrence_date, requester_id, group_id, hits_available, id)
            select p.reward, p.id, q.crawl_id, p.requester_name, q.id,
                p.occurrence_date, p.requester_id, p.group_id,
                q.hits_available,
                nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
            from main_hitgroupcontent p
            join main_hitgroupstatus q on (p.first_crawl_id = q.crawl_id
                and q.hit_group_content_id = p.id)
            where q.crawl_id = %s;""" % crawl_id)
        execute_sql('commit;')
def handle(self, *args, **options):

    def _to_hit_group_class(results):
        for result in results:
            doc = result['document']
            prob = result['probabilities']
            yield HitGroupClass(
                group_id=doc['group_id'],
                classes=NaiveBayesClassifier.most_likely(result),
                probabilities=json.dumps(prob))

    if options['clear_all']:
        logger.info('Removing all existing classification')
        # HitGroupClass.objects.all().delete()
        execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
        return

    if options['begin'] and options['end']:
        # XXX it can be slow.
        query = '''
            SELECT group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            JOIN hits_mv ON content.group_id = hits_mv.group_id
            WHERE NOT EXISTS(
                SELECT * FROM main_hitgroupclass as class
                WHERE content.group_id = class.group_id
            )
            AND hits_mv.start_time >= {}
            AND hits_mv.start_time < {}
            GROUP BY group_id LIMIT {};
        '''.format(options['begin'], options['end'], self.BATCH_SIZE)
    else:
        query = '''
            SELECT group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            WHERE NOT EXISTS(
                SELECT * FROM main_hitgroupclass as class
                WHERE content.group_id = class.group_id
            ) LIMIT {};
        '''.format(self.BATCH_SIZE)

    if not options['classifier_path']:
        try:
            options['classifier_path'] = settings.CLASSIFIER_PATH
        except AttributeError:
            raise ImproperlyConfigured('Classifier path is specified neither '
                                       'in the settings file nor on the '
                                       'command line')

    with open(options['classifier_path'], 'r') as file:
        probabilities = json.load(file)
        classifier = NaiveBayesClassifier(probabilities=probabilities)

    logger.info('Classification of hit groups started. Processing in '
                'batches of size {}'.format(self.BATCH_SIZE))

    while True:
        models = query_to_dicts(query)
        logger.info('Batch classification started')
        try:
            results = _to_hit_group_class(classifier.classify_batch(models))
            HitGroupClass.objects.bulk_create(results)
        except EmptyBatchException:
            logger.info('Batch is empty, no hit groups to classify')
            break
        logger.info('Batch classified successfully')
def clear_existing_rows(start, end):
    t = time.time()
    log.info("Deleting crawl agregates for period {0} to {1}.".format(start, end))
    execute_sql(
        "DELETE FROM main_crawlagregates WHERE start_time BETWEEN '{0}'"
        " and '{1}'".format(start.isoformat(), end.isoformat()),
        commit=True,
    )
    log.info("Deleting crawl agregates for period {0} to {1} took {2}s.".format(
        start, end, time.time() - t))
def __create_extra_tables():
    """No automation here, just add lines for extra tables wanted."""
    for table_name, table_sql in EXTRA_TABLES.iteritems():
        path = os.path.join(SQL_PATH, table_sql)
        if table_exists(table_name):
            log.warning('Table {0} exists.'.format(table_name))
        else:
            log.info('Creating table {0}.'.format(table_name))
            execute_sql(open(path).read())
    transaction.commit_unless_managed()
def update_interrupted_crawl_stats(self): """Checks for crawls that were interrupted before updating the number of downloaded hit groups. Currently those crawls can be found by checking for crawls having groups_downloaded = groups_available, since at the begining of the crawl those values are set to be equal. """ execute_sql(self.recount_query) transaction.commit_unless_managed()
def clear_past_results(self, start, end):
    """Removes records from the hits_mv and main_crawlagregates tables
    matching the given time period.
    """
    for table in ['hits_mv', 'main_crawlagregates']:
        st = time.time()
        self.log.info('Deleting rows from {0} where start_time between {1}'
                      ' and {2}.'.format(table, start, end))
        q = "DELETE FROM {0} WHERE start_time BETWEEN '{1}' AND '{2}';"
        execute_sql(q.format(table, start, end), commit=True)
        self.log.info('{0}s elapsed.'.format(time.time() - st))
def update_interrupted_crawl_stats(self): """Checks for crawls that were interrupted before updating the number of downloaded hit groups. Currently those crawls can be found by checking for crawls having groups_downloaded = groups_available, since at the begining of the crawl those values are set to be equal. """ log.info('--fix-interrupted specified, updating interrupted crawls to ' 'have groups_downloaded match the actual downloaded object count.') execute_sql(self.recount_query) transaction.commit_unless_managed()
def update_mviews():
    """Creates hits_mv records for crawls with enough groups_downloaded,
    taking data from the main_hitgroupstatus and main_hitgroupcontent tables.

    Filtering out incomplete anonymous crawls
    -----------------------------------------
    Note: this is covered by the latter case.

    There is a 20 page limit for anonymous mturk.com users, each page having
    10 hitgroups, while the usual number of hitgroups is at least 100. Thus
    all crawls with 200 or fewer hitgroups should be excluded.

    Filtering erroneous crawls
    --------------------------
    Usually, a difference between hitgroups_downloaded and
    hitgroups_available exceeding 10 percent denotes a crawl error. Such
    crawls should be excluded from creating hits_mv and further table records.
    """
    query = """
        select id, start_time from main_crawl p
        where p.success = true
            and old_id is null
            and has_hits_mv = false
            and not exists (select id from main_crawlagregates
                where crawl_id = p.id)
            and p.groups_available * 0.9 < p.groups_downloaded
        order by id desc"""

    missing_crawls = query_to_tuples(query)

    for row in missing_crawls:
        crawl_id, start_time = row

        log.info("inserting missing crawl: %s" % crawl_id)
        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)

        execute_sql("""INSERT INTO hits_mv (status_id, content_id, group_id,
                crawl_id, start_time, requester_id, hits_available,
                page_number, inpage_position, hit_expiration_date, reward,
                time_alloted, hits_diff, is_spam)
            SELECT p.id AS status_id, q.id AS content_id, p.group_id,
                p.crawl_id, TIMESTAMP '%s', q.requester_id, p.hits_available,
                p.page_number, p.inpage_position, p.hit_expiration_date,
                q.reward, q.time_alloted, null, q.is_spam
            FROM main_hitgroupstatus p
            JOIN main_hitgroupcontent q
                ON (q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id)
            WHERE p.crawl_id = %s;
            """ % (start_time, crawl_id))

        execute_sql(("update main_crawl set has_hits_mv = true where"
                     " id = %s") % crawl_id)
        execute_sql('commit;')
def updatehitgroup(g, cid):
    """Updates hits_mv.hits_diff for the given group by subtracting the
    previous crawl's hits_available from the current one.
    """
    prev = execute_sql("""select hits_available from hits_mv
        where crawl_id between %s and %s
            and group_id = '%s'
        order by crawl_id desc limit 1;""" % (cid - 100, cid - 1, g)).fetchall()
    prev = prev[0][0] if prev else 0

    execute_sql("""update hits_mv set hits_diff = hits_available - %s
        where group_id = '%s' and crawl_id = %s;""" % (prev, g, cid))
def clear_past_results(self):
    """Clears results in the hits_mv and main_crawlagregates tables.

    Tables cleared:
    hits_mv -- hits_posted and hits_consumed
    main_crawlagregates -- hits_consumed, hits_posted, hitgroups_posted,
        hitgroups_consumed
    """
    self.log.info("Clearing existing hits_mv columns.")
    clear_time = time.time()
    execute_sql(
        """UPDATE hits_mv SET hits_posted = 0, hits_consumed = 0
        WHERE crawl_id IN (
            SELECT id FROM main_crawl
            WHERE start_time BETWEEN '{0}' AND '{1}'
        ) AND (hits_posted > 0 OR hits_consumed > 0);
        """.format(self.start.isoformat(), self.end.isoformat()),
        commit=True,
    ).close()
    self.log.info("{0}s elapsed.".format(time.time() - clear_time))

    self.log.info("Clearing existing main_crawlagregates columns.")
    clear_time = time.time()
    execute_sql(
        """UPDATE main_crawlagregates
        SET hits_posted = 0, hits_consumed = 0, hitgroups_posted = 0,
            hitgroups_consumed = 0
        WHERE crawl_id IN (
            SELECT id FROM main_crawl
            WHERE start_time BETWEEN '{0}' AND '{1}'
        ) AND (hits_posted > 0 OR hits_consumed > 0 OR
               hitgroups_consumed > 0 OR hitgroups_posted > 0);
        """.format(self.start.isoformat(), self.end.isoformat()),
        commit=True,
    ).close()
    self.log.info("{0}s elapsed.".format(time.time() - clear_time))
def do_deletes_simple(self, ids):
    """Performs a query per crawl and per table."""
    qs = list(self.__get_delete_queries(['hits_mv', 'hits_temp'], '='))
    for i, crawl_id in enumerate(ids, start=1):
        if self.limit and i > self.limit:
            break
        for q in qs:
            execute_sql(q.format(crawl_id))
        if i % 10 == 0:
            log.info(("{0}/{1} crawls processed, {2}s elapsed so far."
                      ).format(i, self.crawl_count, self.time_elapsed()))
            transaction.commit_unless_managed()
        execute_sql(("update main_crawl set has_hits_mv = false where"
                     " id = {0}").format(crawl_id))
    transaction.commit_unless_managed()
    return i
def do_deletes_chunked(self, ids):
    """More complex version, does multiple crawls at a time."""
    processed = 0
    qs = list(self.__get_delete_queries(['hits_mv', 'hits_temp'], 'in'))
    for chunk in self.read_chunks(
            ids, limit=self.limit, chunk_size=self.chunk_size):
        chunk_str = self.__chunk_str(chunk)
        for q in qs:
            execute_sql(q.format(chunk_str))
        execute_sql(("update main_crawl set has_hits_mv = false where"
                     " id in {0}").format(chunk_str))
        processed += len(chunk)
        log.info('Processed crawls: {0}, {1}/{2} in {3}s.'.format(
            chunk, processed, self.crawl_count, self.time_elapsed()))
        transaction.commit_unless_managed()
    return processed
def handle(self, **options):
    update_crawl_agregates(only_new=False)

    f = open(os.path.join(settings.ROOT_PATH, 'crawl.errors.csv'), "rb")
    progress = 10

    execute_sql("update main_crawl set success = true where old_id is not null")

    for i, id in enumerate(f):
        id = id.strip()
        execute_sql("delete from main_crawlagregates where crawl_id = "
                    "(select id from main_crawl where old_id = %s)" % id)
        execute_sql("update main_crawl set success = false where old_id = %s" % id)
        if i % progress == 0:
            print "processed %s rows" % i
            execute_sql("commit;")

    execute_sql("commit;")
def update_cid(cid):
    """Updates hits_diff on hits_mv records related to the crawl with id
    equal to cid.
    """
    st = time.time()
    count = 0
    for i, g in enumerate(query_to_tuples(
            "select distinct group_id from hits_mv where crawl_id = %s", cid)):
        g = g[0]
        if i == 0:
            log.info("processing %s, %s %s", i, cid, g)
        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")
    log.info("Updated crawl {0} in {1}s. {2} groups processed.".format(
        cid, time.time() - st, count))
    return count
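# Usage sketch (illustration, not project code): update_cid() can be driven by
# last_crawlids(), defined later in this module (assuming both are importable
# from the same place), to recompute hits_diff for the most recent crawls:
#
#     for cid in last_crawlids(limit=5):
#         update_cid(cid)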
def handle(self, **options):
    try:
        commit_threshold = 10
        results = query_to_dicts("select id from main_crawl p where not exists"
                                 "(select id from hits_mv where crawl_id = p.id)")

        for i, row in enumerate(results):
            execute_sql("""insert into hits_mv
                SELECT p.id AS status_id, q.id AS content_id, p.group_id,
                    p.crawl_id,
                    (SELECT main_crawl.start_time FROM main_crawl
                     WHERE main_crawl.id = p.crawl_id) AS start_time,
                    q.requester_id, p.hits_available, p.page_number,
                    p.inpage_position, p.hit_expiration_date, q.reward,
                    q.time_alloted
                FROM main_hitgroupstatus p
                JOIN main_hitgroupcontent q
                    ON q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id
                WHERE p.crawl_id = %s""" % row['id'])

            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'commited after %s crawls' % i
                execute_sql('commit;')

    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occured at: %s line, %s %s' % (i, error_info['type'],
                                                       error_info['value'])
        execute_sql('rollback;')
def __create_procedures():
    """Executes create or replace for each procedure mentioned in
    PROCEDURES_TO_CREATE.

    Each procedure is defined by a filename and a procedure-creating function.
    The filename with the extension stripped becomes the new procedure's name
    and the file's content its body. The create function should take the
    procedure's name and the content of its file. The simplest such function
    is create_no_args, which simply wraps the procedure code in a correct
    CREATE OR REPLACE FUNCTION statement.
    """
    sqls = []
    for prfile, method in PROCEDURES_TO_CREATE.iteritems():
        prname = os.path.splitext(os.path.basename(prfile))[0]
        sqlpath = os.path.join(SQL_PATH, prfile)
        log.info('Creating procedure {prname} from file: {sqlpath}'.format(
            prname=prname, sqlpath=sqlpath))
        sqls.append(method(prname, open(sqlpath).read()))
    execute_sql('\n'.join(sqls))
    transaction.commit_unless_managed()
def handle_add_review():
    data = json.loads(request.data.decode())
    barcode_id = data['barcode_id']

    barcode_reviews = execute_sql(
        f"SELECT reviews "
        f"FROM barcodes "
        f"WHERE id={barcode_id}",
        POSTGRES_CONNECTION_PARAMS)[0]['reviews']

    barcode_reviews.append({
        'text': data['text'],
        'email': data['email'],
        'display_name': data['display_name'],
    })

    execute_sql(
        f"UPDATE barcodes "
        f"SET reviews={Json(barcode_reviews)} "
        f"WHERE id={barcode_id}",
        POSTGRES_CONNECTION_PARAMS)

    response_dict = {
        'status': 'ok',
    }
    response_json = json.dumps(response_dict, ensure_ascii=False)
    response = Response(response_json,
                        content_type="application/json; charset=utf-8")
    return response
def handle_barcodes():
    barcodes = execute_sql("SELECT * "
                           "FROM barcodes "
                           "ORDER BY id",
                           POSTGRES_CONNECTION_PARAMS)

    barcodes_len = len(barcodes)
    for i in range(barcodes_len):
        barcodes[i]['code'] = binary_to_str(barcodes[i]['code'])
        barcodes[i]['text'] = binary_to_str(barcodes[i]['text'])
        barcodes[i]['email'] = binary_to_str(barcodes[i]['email'])
        barcodes[i]['display_name'] = binary_to_str(
            barcodes[i]['display_name'])

    response_dict = {'status': 'ok', 'barcodes': barcodes}
    response_json = json.dumps(response_dict, ensure_ascii=False)
    response = Response(response_json,
                        content_type="application/json; charset=utf-8")
    return response
def avg_weekly_events_per_month_location():
    sub_query = """
        SELECT location.name, location.id,
            DATE_TRUNC('week', event.start) AS week,
            DATE_TRUNC('month', event.start) AS month,
            COUNT(event.uuid) AS week_count
        FROM api_event event
        INNER JOIN api_location location ON (event.location_id = location.id)
        GROUP BY location.name, week, month, location.id
    """
    main_query = """
        SELECT month, name, ROUND(AVG("week_count"), 2) as weekly_avg
        FROM (%s) AS sub_table
        GROUP BY month, name
        ORDER BY name
    """ % (sub_query,)
    return execute_sql(main_query)
def topreq_data_hits_posted_crawl_id(days):
    """Evaluates toprequesters by the number of hits posted in a period.

    Keyword arguments:
    days -- number of days to look back
    """
    # select the first crawl considered by start_time
    firstcrawl = execute_sql("""
        SELECT crawl_id
        FROM hits_mv
        WHERE start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""",
        datetime.date.today() - datetime.timedelta(int(days))).fetchall()[0][0]

    # We are only interested in records having hits_posted > 0, thus only such
    # records will appear on the list and max(start_time) should be available
    # at all times.
    return list(query_to_tuples("""
        SELECT h.requester_id, h.requester_name,
            coalesce(count(distinct mv.group_id), 0) as "projects",
            coalesce(sum(mv.hits_posted), 0) as "hits",
            coalesce(sum(mv.hits_posted * h.reward), 0) as "reward",
            max(mv.start_time) as "last_posted"
        FROM main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT group_id, hits_posted, start_time
                FROM hits_mv
                WHERE crawl_id > {0} AND hits_posted > 0) mv
                ON h.group_id = mv.group_id
        WHERE coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY reward desc;""".format(firstcrawl)))
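# Usage sketch (illustration only): the tuples returned above follow the
# SELECT list of the query, so a caller could unpack them like this:
#
#     for requester_id, requester_name, projects, hits, reward, last_posted in \
#             topreq_data_hits_posted_crawl_id(days=30):
#         ...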
def handle_add_barcode():
    data = json.loads(request.data.decode())
    print(data)

    code = str_to_binary(str(data['code']))
    text = str_to_binary(data['text'])
    email = str_to_binary(data['email'])
    display_name = str_to_binary(data['display_name'])
    reviews = Json([])

    barcode_id = execute_sql(
        f"INSERT INTO barcodes(code, text, email, display_name, reviews) "
        f"VALUES ({code}, {text}, {email}, {display_name}, {reviews}) "
        f"RETURNING id",
        POSTGRES_CONNECTION_PARAMS,
    )[0]['id']

    response_dict = {'status': 'ok', 'barcode_id': barcode_id}
    response_json = json.dumps(response_dict, ensure_ascii=False)
    response = Response(response_json,
                        content_type="application/json; charset=utf-8")
    return response
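# Registration sketch (assumption: these handlers belong to a Flask application
# object named `app`; the URL rules and endpoint names below are hypothetical):
#
#     app.add_url_rule('/barcodes', 'handle_barcodes',
#                      handle_barcodes, methods=['GET'])
#     app.add_url_rule('/barcodes/add', 'handle_add_barcode',
#                      handle_add_barcode, methods=['POST'])
#     app.add_url_rule('/reviews/add', 'handle_add_review',
#                      handle_add_review, methods=['POST'])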
def update_crawl_agregates(commit_threshold=1000, start=None, end=None,
                           clear_existing=False):
    """Creates main_crawlagregates records from hits_mv."""

    def print_status(number, row_id):
        log.info("Committed after %s crawls, last id %s." % (number, row_id))

    if clear_existing and start and end:
        clear_existing_rows(start, end)

    i = 0
    for i, row in enumerate(get_crawls(start=start, end=end)):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates (hits, start_time, reward,
                    crawl_id, id, projects, spam_projects)
                SELECT sum(hits_available) as "hits", start_time,
                    sum(reward * hits_available) as "reward", crawl_id,
                    nextval('main_crawlagregates_id_seq'),
                    count(*) as "count",
                    count(CASE WHEN is_spam = TRUE then TRUE ELSE NULL END)
                FROM (SELECT DISTINCT ON (group_id) * FROM hits_mv
                      WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row["id"])

            if i % commit_threshold == 0:
                print_status(i + 1, row["id"])
                transaction.commit_unless_managed()

        except:
            error_info = grab_error(sys.exc_info())
            log.error("an error occured at crawl_id: %s, %s %s" % (
                row["id"], error_info["type"], error_info["value"]))
            execute_sql("rollback;")

    if i % commit_threshold != 0:
        print_status(i + 1, row["id"])
        transaction.commit_unless_managed()

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;",
                commit=True)
    transaction.commit_unless_managed()
def most_popular_venue_per_category():
    sub_query = """
        SELECT category.uuid AS category_uuid,
            category.name AS category_name,
            venue.name AS venue_name,
            venue.uuid AS venue_uuid,
            COUNT(event.uuid) AS event_count,
            ROW_NUMBER() OVER (
                PARTITION BY category.uuid
                ORDER BY COUNT(event.uuid) DESC
            ) AS rank_filter
        FROM api_venue venue
        INNER JOIN api_event event ON (venue.uuid = event.venue_id)
        INNER JOIN api_category category ON (category.uuid = event.category_id)
        GROUP BY category_uuid, category_name, venue_name, venue_uuid
    """
    main_query = """
        SELECT *
        FROM (%s) AS sub_table
        WHERE "rank_filter" = 1
    """ % (sub_query,)
    return execute_sql(main_query)
def calculate_first_crawl_id():
    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent "
                             "where first_crawl_id is null")
    log.info('got missing ids results')

    for i, r in enumerate(results):
        log.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                (select min(crawl_id) from main_hitgroupstatus
                 where hit_group_content_id = p.id)
            where id = %s""" % r['id'])
        if i % progress == 0:
            execute_sql('commit;')
            log.info("updated %s main_hitgroupcontent rows with "
                     "first_crawl_id" % i)

    execute_sql('commit;')
def handle(self, **options):
    try:
        commit_threshold = 1000
        results = query_to_dicts("select id from main_hitgroupcontent "
                                 "where first_crawl_id is null")

        for i, row in enumerate(results):
            execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                    (select crawl_id from main_hitgroupstatus
                     where group_id = p.group_id
                     order by crawl_id asc LIMIT 1)
                where id = %s""" % row['id'])

            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'commited after %s rows' % i
                execute_sql('commit;')

    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occured at: %s line, %s %s' % (i, error_info['type'],
                                                       error_info['value'])
        execute_sql('rollback;')
def clean_duplicates():
    ids = query_to_dicts("select group_id from main_hitgroupcontent "
                         "group by group_id having count(*) > 1;")

    for id in ids:
        log.info("Deleting duplicate group %s" % id['group_id'])
        execute_sql("""delete from main_hitgroupstatus where
            hit_group_content_id in (
                select id from main_hitgroupcontent
                where id != (select min(id) from main_hitgroupcontent
                             where group_id = '%s')
                    and group_id = '%s');
            """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent
            where id != (select min(id) from main_hitgroupcontent
                         where group_id = '%s')
                and group_id = '%s'
            """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
def handle(self, *args, **options):
    if options["clear_all"]:
        logger.info("Removing all existing classification aggregates")
        # HitGroupClassAggregate.objects.all().delete()
        execute_sql('DELETE FROM main_hitgroupclassaggregate;', commit=True)
        return

    begin = options["begin"]
    end = options["end"]

    # If the end of the time interval is not specified, compute aggregates
    # up to now.
    if end is None:
        end = datetime.now()
    else:
        end = parse(end)

    # If the beginning of the time interval is not specified, compute from
    # the last existing aggregates.
    if begin is None:
        aggregate = HitGroupClassAggregate.objects.aggregate(Max('start_time'))
        start_time = aggregate['start_time__max']
        if start_time is None:
            aggregate = Crawl.objects.aggregate(Min('start_time'))
            start_time = aggregate['start_time__min'] + timedelta(seconds=1)
        else:
            start_time += timedelta(seconds=1)
        if start_time is not None:
            begin = start_time
        else:
            logger.warn("No crawls for processing")
            return
    else:
        begin = parse(begin)

    begin = begin.replace(tzinfo=None)
    end = end.replace(tzinfo=None)

    queryset = HitGroupClassAggregate.objects.filter(start_time__gte=begin,
                                                     start_time__lt=end)
    if options["clear_existing"]:
        queryset.delete()

    if queryset.exists():
        logger.error("Some of the aggregates are already calculated for the "
                     "given time interval")
        return

    logger.info("Calculating hit group class aggregates from {} to {} "
                "with 1-hour intervals".format(begin, end))

    chunk_begin = begin
    chunk_end = begin + timedelta(hours=1)

    query_template = """
        INSERT INTO main_hitgroupclassaggregate (
            crawl_id, start_time, classes, hits_available
        )
        SELECT crawl_id, start_time, classes,
            sum(hits_available) as hits_available
        FROM hits_mv
        JOIN main_hitgroupclass
            ON hits_mv.group_id = main_hitgroupclass.group_id
        WHERE start_time >= '{}' AND start_time < '{}'
        GROUP BY crawl_id, start_time, classes;
    """

    while chunk_end < end:
        logger.info("Processing chunk of crawls from {} to {}"
                    .format(chunk_begin, chunk_end))
        query = query_template.format(chunk_begin, chunk_end)
        execute_sql(query, commit=True)
        chunk_begin = chunk_end
        chunk_end += timedelta(hours=1)

    if chunk_begin < end:
        query = query_template.format(chunk_begin, end)
        execute_sql(query, commit=True)

    logger.info("Hit group class aggregates are made")
def last_crawlids(limit=10):
    r = execute_sql("select crawl_id from hits_mv order by crawl_id desc "
                    "limit %s", limit)
    return [c[0] for c in r.fetchall()]
def handle(self, **options):
    '''
    Sample data:

    groupid | hit_title | requester_name | requester_id | description |
    keywords | qualifications | hit_expiration_date | time_allotted | reward |
    hits_available | time_crawled | crawl_id | pageNo | inpage_position | dollars

    CW18RQ8BPZWWMWY3DY4Z | Validate the addresses. Great bonus | Product Search
    | A28JEQTWH76JDT | Given the list of addresses and it's site URL, please
    verify whether the given addresses exists in the site or not. [ Note: Bonus
    and Reward Details: 1). For each validated address you will get $0.02 as
    bonus. Suppose if the input file contains 10 addresses and you have
    validated all of them then you will get $0.2(i.e., 10*0.02) as bonus +
    $0.01 as reward. ] | reward, online, shopping, web, quality, testing,
    relevance, search, engine, US, India, browse, hit | HIT approval rate (%)
    is not less than 80 | Aug 26, 2009 (33 weeks) | 1 day 12 hours | $0.01 |
    295 | 2009-01-07 14:00:05 | 1 | 2 | 5 | 0.01

    HitGroupContent:
        group_id = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
        group_id_hashed = models.BooleanField(default=False)
        requester_id = models.CharField('Requester ID', max_length=50, db_index=True)
        requester_name = models.CharField('Requester Name', max_length=500)
        reward = models.FloatField('Reward')
        html = models.TextField('HTML', max_length=100000000)
        description = models.TextField('Description', max_length=1000000)
        title = models.CharField('Title', max_length=500)
        keywords = models.CharField('Keywords', blank=True, max_length=500, null=True)
        qualifications = models.CharField('Qualifications', blank=True, max_length=500, null=True)
        occurrence_date = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
        time_alloted = models.IntegerField('Time alloted')
        first_crawl = models.ForeignKey(Crawl, blank=True, null=True)

    HitGroupStatus:
        group_id = models.CharField('Group ID', max_length=50, db_index=True)
        hits_available = models.IntegerField('Hits Avaliable')
        page_number = models.IntegerField('Page Number')
        inpage_position = models.IntegerField('In Page Position')
        hit_expiration_date = models.DateTimeField('Hit expiration Date')
        hit_group_content = models.ForeignKey(HitGroupContent)
        crawl = models.ForeignKey(Crawl)
    '''
    items_per_transaction = 1000
    transaction_count = 0
    i = 0
    hit_group_content_mapping = {}
    crawl_mapping = {}

    print 'setting up crawl mappings'
    crawls = Crawl.objects.all().values_list('old_id', 'pk')
    for row in crawls:
        crawl_mapping[row[0]] = row[1]
    del crawls

    try:
        i = get_counter('import_hitgroups_line')
    except:
        pass

    try:
        f = open(os.path.join(settings.ROOT_PATH, 'data', 'hits.utf8.csv'), "rb")
        error_log = open(os.path.join(settings.ROOT_PATH, 'data',
                                      'error.hits.utf8.csv'), 'w')

        # seek to the file position stored in the counter
        try:
            last_position = get_counter('import_hitgroups')
            print 'starting from: %s' % last_position
            f.seek(int(last_position))
        except:
            f.readline()
            print 'could not find last position, starting from first line'

        transaction.enter_transaction_management()
        transaction.managed(True)

        for row in f:
            try:
                row = row.strip()
                (group_id, title, requster_name, requester_id, description,
                 keywords, qualifications, hit_expiration_date, time_alloted,
                 reward, hits_available, time_crawled, crawl_id, page_no,
                 inpage_position, dollars) = tuple(row.split('|'))

                # Check if there already is a HitGroupContent for this row;
                # if one exists, do nothing.
                reward = float(reward[1:])  # stripping starting $, e.g. $0.1
                # parsing strings like "4 hours 30 minutes" to int minutes
                time_alloted = parse_time_alloted(time_alloted)
                crawl_id = int(crawl_id)
                hits_available = int(hits_available)
                page_no = int(page_no)
                inpage_position = int(inpage_position)
                hashed_group_id = False

                if group_id == '':
                    group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (
                        title, requester_id, time_alloted, reward,
                        description, keywords, qualifications)).hexdigest()
                    hashed_group_id = True

                # e.g. "Apr 5, 2009 (4 weeks 1 day)"
                hit_expiration_date = datetime.datetime.strptime(
                    re.sub('\(.*\)', '', hit_expiration_date).strip(),
                    "%b %d, %Y")

                exists = False
                content_id = execute_sql(
                    "select id from main_hitgroupcontent where group_id = '%s'"
                    % group_id).fetchone()
                if content_id is not None:
                    hit_group_content_mapping[group_id] = content_id[0]
                    exists = True

                if not exists:
                    # If not: save a new HitGroupContent object and store the
                    # mapping in memory.
                    obj = HitGroupContent(group_id_hashed=hashed_group_id,
                                          group_id=group_id,
                                          requester_id=requester_id,
                                          requester_name=requster_name,
                                          reward=reward,
                                          description=description,
                                          title=title,
                                          keywords=keywords,
                                          qualifications=qualifications,
                                          time_alloted=time_alloted,
                                          occurrence_date=time_crawled)
                    obj.save()
                    hit_group_content_mapping[group_id] = obj.pk

                # Store the HitGroupStatus in the db with the correct mapping
                # to HitGroupContent.
                obj = HitGroupStatus(
                    group_id=group_id,
                    hits_available=hits_available,
                    page_number=page_no,
                    inpage_position=inpage_position,
                    hit_expiration_date=hit_expiration_date,
                    hit_group_content_id=hit_group_content_mapping[group_id],
                    crawl_id=crawl_mapping[crawl_id])
                obj.save()

            except (ValueError, KeyError):
                error_info = grab_error(sys.exc_info())
                error_log.write(row)
                error_log.write("\r\n")
                print 'an error occured at: %s line, %s %s' % (
                    i, error_info['type'], error_info['value'])

            i += 1

            # Do a transaction per items_per_transaction rows. When committing
            # a transaction, write the file position and line number to the
            # counter file.
            if i % items_per_transaction == 0:
                transaction.commit()
                transaction_count += 1
                write_counter('import_hitgroups', f.tell())
                write_counter('import_hitgroups_line', i)
                print 'commited %s transaction, line: %s' % (transaction_count, i)

    except KeyboardInterrupt:
        # User stopped the script: roll back the last data, close file
        # descriptors and exit.
        transaction.rollback()
        error_log.close()
        f.close()
        exit()
def last_crawlid():
    return execute_sql("select crawl_id from hits_mv "
                       "order by crawl_id desc limit 1;").fetchall()[0][0]
def hitgroups(cid):
    r = execute_sql("select distinct group_id from hits_mv "
                    "where crawl_id = %s", cid)
    return [g[0] for g in r.fetchall()]