def test_real_escape_string(self): """dbquery - real escape string""" testcase_ok = "Programmer" testcase_injection = "' OR ''='" self.assertEqual(dbquery.real_escape_string(testcase_ok), testcase_ok) self.assertNotEqual(dbquery.real_escape_string(testcase_injection), testcase_injection)
def create_collection_bibrec(table_name, coll_name, step_size=10000, max_size=-1): if table_name[0] != '_': raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important") create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', dbquery.real_escape_string(table_name)) dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % dbquery.real_escape_string(table_name)) dbquery.run_sql(create_stmt) # now retrieve the collection c = search_engine.get_collection_reclist(coll_name) # reverse sort it c = sorted(c, reverse=True) if len(c) < 0: sys.stderr.write("The collection %s is empty!\n" % coll_name) c = list(c) l = len(c) if max_size > 0: l = max_size i = 0 sys.stderr.write("Copying bibrec data, patience please...\n") while i < l: dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" % (dbquery.real_escape_string(table_name), ','.join(map(str, c[i:i+step_size])))) i = i + len(c[i:i+step_size]) #sys.stderr.write("%s\n" % i) sys.stderr.write("Total number of records: %s Copied: %s\n" % (len(c), min(l, len(c))))
def create_collection_bibrec(table_name, coll_name, step_size=10000, maxsize=None): if table_name[0] != '_': raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important") create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', dbquery.real_escape_string(table_name)) dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % dbquery.real_escape_string(table_name)) dbquery.run_sql(create_stmt) print create_stmt #now retrieve the collection c = search_engine.get_collection_reclist(coll_name) if len(c) < 0: sys.stderr.write("The collection %s is empty!\n" % coll_name) else: print 'collection has x recs:', len(c) c = list(c) l = len(c) i = 0 sys.stderr.write("Copying bibrec data\n") while i < l: dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" % (dbquery.real_escape_string(table_name), ','.join(map(str, c[i:i+step_size])))) i = i + step_size sys.stderr.write("%s\n" % i) if (maxsize and i > maxsize): break sys.stderr.write("Total number of records: %s\n" % l)
def gc_tasks(verbose=False, statuses=None, since=None, tasks=None): # pylint: disable=W0613 """Garbage collect the task queue.""" if tasks is None: tasks = CFG_BIBSCHED_GC_TASKS_TO_REMOVE + CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE if since is None: since = '-%id' % CFG_BIBSCHED_GC_TASKS_OLDER_THAN if statuses is None: statuses = ['DONE'] statuses = [status.upper() for status in statuses if status.upper() != 'RUNNING'] date = get_datetime(since) status_query = 'status in (%s)' % ','.join([repr(real_escape_string(status)) for status in statuses]) for task in tasks: if task in CFG_BIBSCHED_GC_TASKS_TO_REMOVE: res = run_sql("""DELETE FROM schTASK WHERE proc=%%s AND %s AND runtime<%%s""" % status_query, (task, date)) write_message('Deleted %s %s tasks (created before %s) with %s' % (res, task, date, status_query)) elif task in CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE: run_sql("""INSERT INTO hstTASK(id,proc,host,user, runtime,sleeptime,arguments,status,progress) SELECT id,proc,host,user, runtime,sleeptime,arguments,status,progress FROM schTASK WHERE proc=%%s AND %s AND runtime<%%s""" % status_query, (task, date)) res = run_sql("""DELETE FROM schTASK WHERE proc=%%s AND %s AND runtime<%%s""" % status_query, (task, date)) write_message('Archived %s %s tasks (created before %s) with %s' % (res, task, date, status_query))
def report_about_processes(status='RUNNING', since=None, tasks=None): """ Helper function to report about processes with the given status. """ if tasks is None: task_query = '' else: task_query = 'AND proc IN (%s)' % ( ','.join([repr(real_escape_string(task)) for task in tasks])) if since is None: since_query = '' else: # We're not interested in future task if since.startswith('+') or since.startswith('-'): since = since[1:] since = '-' + since since_query = "AND runtime >= '%s'" % get_datetime(since) res = run_sql("""SELECT id, proc, runtime, status, priority, host, sequenceid FROM schTASK WHERE status=%%s %(task_query)s %(since_query)s ORDER BY id ASC""" % { 'task_query': task_query, 'since_query' : since_query}, (status,)) write_message("%s processes: %d" % (status, len(res))) for t in Task.from_resultset(res): write_message(' * %s' % t) return
def basket_updater(basket): """""" (bskid, name, is_public, id_owner, date_modification) = basket try: int(bskid) int(id_owner) except: print "#####################" print "id basket:" print bskid print "id user" print id_owner print "#########################" return "(%i,'%s',%i,'%s')" % (int(bskid), real_escape_string( name), int(id_owner), real_escape_string(date_modification))
def records_updater(record): (bskid, id_owner, id_record, order, date_modification) = record return "(%i,%i,%i,%i,'%s')" % ( int(id_record), int(bskid), int(id_owner), int(order), real_escape_string(date_modification), )
def basket_updater(basket): """""" (bskid, name, is_public, id_owner, date_modification) = basket try: int(bskid) int(id_owner) except: print "#####################" print "id basket:" print bskid print "id user" print id_owner print "#########################" return "(%i,'%s',%i,'%s')" % (int(bskid), real_escape_string(name), int(id_owner), real_escape_string(date_modification))
def records_updater(record): (bskid, id_owner, id_record, order, date_modification) = record return "(%i,%i,%i,%i,'%s')" % (int(id_record), int(bskid), int(id_owner), int(order), real_escape_string(date_modification))
def get_recids_changes(last_recid, max_recs=10000, mod_date=None, table='bibrec'): """ Retrieves the sets of records that were added/updated/deleted The time is selected according to some know recid, ie. we retrieve the modification time of one record and look at those that are older. OR You can pass in the date that you are interested in, in the format: YYYY-MM-DD HH:MM:SS added => bibrec.modification_date == bibrec.creation_date updated => bibrec.modification_date >= bibrec.creation_date deleted => bibrec.status == DELETED """ table = dbquery.real_escape_string(table) search_op = '>' if not mod_date: if last_recid == -1: l = list(dbquery.run_sql("SELECT modification_date FROM `%s` ORDER BY modification_date ASC LIMIT 1" % (table,))) mod_date = l[0][0].strftime(format="%Y-%m-%d %H:%M:%S") search_op = '>=' else: # let's make sure we have a valid recid (or get the close valid one) l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id >= %s LIMIT 1", (last_recid,))) if not len(l): return last_recid = l[0][0] mod_date = l[0][1].strftime(format="%Y-%m-%d %H:%M:%S") # there is not api to get this (at least i haven't found it) #mod_date = search_engine.get_modification_date(last_recid, fmt="%Y-%m-%d %H:%i:%S") #if not mod_date: # return modified_records = list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table + "` WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" % (mod_date, max_recs ))) #sys.stderr.write(str(("SELECT id,modification_date, creation_date FROM bibrec " # "WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" % # (mod_date, max_recs ))) + "\n") #print len(modified_records) if not len(modified_records): return added = [] updated = [] deleted = [] dels = {} for x in list(dbquery.run_sql("""SELECT distinct(id_bibrec) FROM bibrec_bib98x WHERE id_bibrec >= %s AND id_bibrec <= %s AND id_bibxxx=(SELECT id FROM bib98x WHERE VALUE='%s')""" % (modified_records[0][0], modified_records[-1][0], 'DELETED'))): dels[int(x[0])] = 1 for recid, mod_date, create_date in modified_records: recid = int(recid) # this is AWFULLY slow! 100x times #status = search_engine.record_exists(recid) if recid in dels: #if status == -1: deleted.append(recid) elif mod_date == create_date: added.append(recid) else: updated.append(recid) return {'DELETED': deleted, 'UPDATED': updated, 'ADDED': added}, recid, str(mod_date)
def get_recids_changes(last_recid, max_recs=10000, mod_date=None, table='bibrec'): """ Retrieves the sets of records that were added/updated/deleted The time is selected according to some know recid, ie. we retrieve the modification time of one record and look at those that are older. OR You can pass in the date that you are interested in, in the format: YYYY-MM-DD HH:MM:SS added => bibrec.modification_date == bibrec.creation_date updated => bibrec.modification_date >= bibrec.creation_date deleted => bibrec.status == DELETED We usually return list of max_recs size, however if there are records past max_recs size and their modification date is the same, we'll add them too. This is a workaround and necessary because Invenio doesn't use timestamp with high enough granularity """ table = dbquery.real_escape_string(table) search_op = '>' if not mod_date: if last_recid == -1: l = list(dbquery.run_sql("SELECT modification_date FROM `%s` ORDER BY modification_date ASC LIMIT 1" % (table,))) mod_date = l[0][0].strftime(format="%Y-%m-%d %H:%M:%S") search_op = '>=' else: l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id = %s LIMIT 1", (last_recid,))) if not len(l): # let's make sure we have a valid recid (or get the close valid one) BUT this could mean we get the wrong # modification date! (and could result in non-ending loop!) l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id >= %s LIMIT 1", (last_recid,))) if len(l): sys.stderr.write("Warning, the last_recid %s does not exist, we have found the closest higher id %s with mod_date: %s\n" % (l[0][0], l[0][1].strftime(format="%Y-%m-%d %H:%M:%S"))) if not len(l): return last_recid = l[0][0] mod_date = l[0][1].strftime(format="%Y-%m-%d %H:%M:%S") # there is not api to get this (at least i haven't found it) #mod_date = search_engine.get_modification_date(last_recid, fmt="%Y-%m-%d %H:%i:%S") #if not mod_date: # return modified_records = list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table + "` WHERE modification_date " + search_op + " '%s' ORDER BY modification_date ASC, id ASC LIMIT %s" % (mod_date, max_recs ))) #sys.stderr.write(str(("SELECT id,modification_date, creation_date FROM bibrec " # "WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" % # (mod_date, max_recs ))) + "\n") #print len(modified_records) if not len(modified_records): return # because invenio understands regularity of only one sec (which is very stupid) we must make sure # that we include all records that were modified in that one second to close the group #if modified_records[-1][1].strftime(format="%Y-%m-%d %H:%M:%S") == mod_date: for to_add in list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table + "` WHERE modification_date = '%s' AND id > %s ORDER BY id ASC" % (modified_records[-1][1].strftime(format="%Y-%m-%d %H:%M:%S"), modified_records[-1][0]))): modified_records.append(to_add) added = [] updated = [] deleted = [] # find records that are marked as deleted, this can select lots of recs (in the worst case # all deleted recs) but usually it will work with a subrange rec_ids = sorted([r[0] for r in modified_records]) local_min = rec_ids[0] local_max = rec_ids[-1] dels = {} for x in list(dbquery.run_sql("""SELECT distinct(id_bibrec) FROM bibrec_bib98x WHERE (id_bibrec >= %s AND id_bibrec <= %s) AND id_bibxxx=(SELECT id FROM bib98x WHERE VALUE='DELETED')""" % (local_min, local_max))): dels[int(x[0])] = 1 #print "delets local min=%s max=%s found=%s" % (local_min, local_max, len(dels)) for recid, mod_date, create_date in modified_records: recid = int(recid) # this is AWFULLY slow! 100x times #status = search_engine.record_exists(recid) if recid in dels: #if status == -1: deleted.append(recid) elif mod_date == create_date: added.append(recid) else: updated.append(recid) return {'DELETED': deleted, 'UPDATED': updated, 'ADDED': added}, recid, mod_date.strftime(format="%Y-%m-%d %H:%M:%S")