def test_real_escape_string(self):
     """dbquery - real escape string"""
     testcase_ok = "Programmer"
     testcase_injection = "' OR ''='"
     self.assertEqual(dbquery.real_escape_string(testcase_ok), testcase_ok)
     self.assertNotEqual(dbquery.real_escape_string(testcase_injection),
                         testcase_injection)
Beispiel #2
0
def create_collection_bibrec(table_name, coll_name, step_size=10000, max_size=-1):
    if table_name[0] != '_':
        raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important")
    
    create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', dbquery.real_escape_string(table_name))
    dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % dbquery.real_escape_string(table_name))
    dbquery.run_sql(create_stmt)
    
    # now retrieve the collection
    c = search_engine.get_collection_reclist(coll_name)
    # reverse sort it
    c = sorted(c, reverse=True)
    
    if len(c) < 0:
        sys.stderr.write("The collection %s is empty!\n" % coll_name)
    
    c = list(c)
    l = len(c)
    if max_size > 0:
        l = max_size
    i = 0
    sys.stderr.write("Copying bibrec data, patience please...\n")
    while i < l:
        dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" % 
                             (dbquery.real_escape_string(table_name), ','.join(map(str, c[i:i+step_size]))))
        i = i + len(c[i:i+step_size])
        #sys.stderr.write("%s\n" % i)
        
    sys.stderr.write("Total number of records: %s Copied: %s\n" % (len(c), min(l, len(c))))
Beispiel #3
0
def create_collection_bibrec(table_name, coll_name, step_size=10000, maxsize=None):
    if table_name[0] != '_':
        raise Exception("By convention, temporary tables must begin with '_'. I don't want to give you tools to screw st important")
    
    create_stmt = dbquery.run_sql("SHOW CREATE TABLE bibrec")[0][1].replace('bibrec', dbquery.real_escape_string(table_name))
    dbquery.run_sql("DROP TABLE IF EXISTS `%s`" % dbquery.real_escape_string(table_name))
    dbquery.run_sql(create_stmt)
    
    print create_stmt

    #now retrieve the collection
    c = search_engine.get_collection_reclist(coll_name)
    if len(c) < 0:
        sys.stderr.write("The collection %s is empty!\n" % coll_name)
    else:
    	print 'collection has x recs:', len(c)
    
    c = list(c)
    l = len(c)
    i = 0
    sys.stderr.write("Copying bibrec data\n")
    while i < l:
        dbquery.run_sql("INSERT INTO `%s` SELECT * FROM `bibrec` WHERE bibrec.id IN (%s)" % 
                             (dbquery.real_escape_string(table_name), ','.join(map(str, c[i:i+step_size]))))
        i = i + step_size
        sys.stderr.write("%s\n" % i)

        if (maxsize and i > maxsize):
        	break
        
    sys.stderr.write("Total number of records: %s\n" % l)
Beispiel #4
0
def gc_tasks(verbose=False, statuses=None, since=None, tasks=None): # pylint: disable=W0613
    """Garbage collect the task queue."""
    if tasks is None:
        tasks = CFG_BIBSCHED_GC_TASKS_TO_REMOVE + CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE
    if since is None:
        since = '-%id' % CFG_BIBSCHED_GC_TASKS_OLDER_THAN
    if statuses is None:
        statuses = ['DONE']

    statuses = [status.upper() for status in statuses if status.upper() != 'RUNNING']

    date = get_datetime(since)

    status_query = 'status in (%s)' % ','.join([repr(real_escape_string(status)) for status in statuses])

    for task in tasks:
        if task in CFG_BIBSCHED_GC_TASKS_TO_REMOVE:
            res = run_sql("""DELETE FROM schTASK WHERE proc=%%s AND %s AND
                             runtime<%%s""" % status_query, (task, date))
            write_message('Deleted %s %s tasks (created before %s) with %s'
                                            % (res, task, date, status_query))
        elif task in CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE:
            run_sql("""INSERT INTO hstTASK(id,proc,host,user,
                       runtime,sleeptime,arguments,status,progress)
                       SELECT id,proc,host,user,
                       runtime,sleeptime,arguments,status,progress
                       FROM schTASK WHERE proc=%%s AND %s AND
                       runtime<%%s""" % status_query, (task, date))
            res = run_sql("""DELETE FROM schTASK WHERE proc=%%s AND %s AND
                             runtime<%%s""" % status_query, (task, date))
            write_message('Archived %s %s tasks (created before %s) with %s'
                                            % (res, task, date, status_query))
Beispiel #5
0
    def report_about_processes(status='RUNNING', since=None, tasks=None):
        """
        Helper function to report about processes with the given status.
        """
        if tasks is None:
            task_query = ''
        else:
            task_query = 'AND proc IN (%s)' % (
                ','.join([repr(real_escape_string(task)) for task in tasks]))
        if since is None:
            since_query = ''
        else:
            # We're not interested in future task
            if since.startswith('+') or since.startswith('-'):
                since = since[1:]
            since = '-' + since
            since_query = "AND runtime >= '%s'" % get_datetime(since)

        res = run_sql("""SELECT id, proc, runtime, status, priority, host,
                         sequenceid
                         FROM schTASK WHERE status=%%s %(task_query)s
                         %(since_query)s ORDER BY id ASC""" % {
                            'task_query': task_query,
                            'since_query' : since_query},
                    (status,))

        write_message("%s processes: %d" % (status, len(res)))
        for t in Task.from_resultset(res):
            write_message(' * %s' % t)
        return
Beispiel #6
0
        def basket_updater(basket):
            """"""
            (bskid, name, is_public, id_owner, date_modification) = basket
            try:
                int(bskid)
                int(id_owner)
            except:
                print "#####################"
                print "id basket:"
                print bskid
                print "id user"
                print id_owner
                print "#########################"

            return "(%i,'%s',%i,'%s')" % (int(bskid), real_escape_string(
                name), int(id_owner), real_escape_string(date_modification))
 def records_updater(record):
     (bskid, id_owner, id_record, order, date_modification) = record
     return "(%i,%i,%i,%i,'%s')" % (
         int(id_record),
         int(bskid),
         int(id_owner),
         int(order),
         real_escape_string(date_modification),
     )
        def basket_updater(basket):
            """"""
            (bskid, name, is_public, id_owner, date_modification) = basket
            try:
                int(bskid)
                int(id_owner)
            except:
                print "#####################"
                print "id basket:"
                print bskid
                print "id user"
                print id_owner
                print "#########################"

            return "(%i,'%s',%i,'%s')" % (int(bskid),
                                          real_escape_string(name),
                                          int(id_owner),
                                          real_escape_string(date_modification))
Beispiel #9
0
 def test_real_escape_string(self):
     """dbquery - real escape string"""
     testcase_ok = "Programmer"
     testcase_injection = "' OR ''='"
     self.assertEqual(dbquery.real_escape_string(testcase_ok), testcase_ok)
     self.assertNotEqual(dbquery.real_escape_string(testcase_injection), testcase_injection)
Beispiel #10
0
 def records_updater(record):
     (bskid, id_owner, id_record, order, date_modification) = record
     return "(%i,%i,%i,%i,'%s')" % (int(id_record), int(bskid),
                                    int(id_owner), int(order),
                                    real_escape_string(date_modification))
Beispiel #11
0
def get_recids_changes(last_recid, max_recs=10000, mod_date=None, table='bibrec'):
    """
    Retrieves the sets of records that were added/updated/deleted
    
    The time is selected according to some know recid, ie. 
    we retrieve the modification time of one record and look
    at those that are older.
    
        OR
        
    You can pass in the date that you are interested in, in the 
    format: YYYY-MM-DD HH:MM:SS
    
    added => bibrec.modification_date == bibrec.creation_date
    updated => bibrec.modification_date >= bibrec.creation_date
    deleted => bibrec.status == DELETED
    """
    table = dbquery.real_escape_string(table)
    search_op = '>'    
    if not mod_date:
        if last_recid == -1:
            l = list(dbquery.run_sql("SELECT modification_date FROM `%s` ORDER BY modification_date ASC LIMIT 1" % (table,)))
            mod_date = l[0][0].strftime(format="%Y-%m-%d %H:%M:%S")
            search_op = '>='
        else:
            # let's make sure we have a valid recid (or get the close valid one)
            l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id >= %s LIMIT 1", (last_recid,)))
            if not len(l):
                return
            last_recid = l[0][0]
            mod_date = l[0][1].strftime(format="%Y-%m-%d %H:%M:%S")
            
            # there is not api to get this (at least i haven't found it)
            #mod_date = search_engine.get_modification_date(last_recid, fmt="%Y-%m-%d %H:%i:%S")
            #if not mod_date:
            #    return
        
    modified_records = list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table +
                    "` WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" %
                    (mod_date, max_recs )))
    
    #sys.stderr.write(str(("SELECT id,modification_date, creation_date FROM bibrec "
    #                "WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" %
    #                (mod_date, max_recs ))) + "\n")
    #print len(modified_records)
    
    if not len(modified_records):
        return
    
    added = []
    updated = []
    deleted = []
    
    dels = {}
    for x in list(dbquery.run_sql("""SELECT distinct(id_bibrec) FROM bibrec_bib98x WHERE 
        id_bibrec >= %s AND id_bibrec <= %s AND 
        id_bibxxx=(SELECT id FROM bib98x WHERE VALUE='%s')""" % (modified_records[0][0],
                                                          modified_records[-1][0],
                                                          'DELETED'))):
        dels[int(x[0])] = 1
    
    
    for recid, mod_date, create_date in modified_records:
        recid = int(recid)
        
        # this is AWFULLY slow! 100x times
        #status = search_engine.record_exists(recid)
        
        if recid in dels:
        #if status == -1:
            deleted.append(recid)
        elif mod_date == create_date:
            added.append(recid)
        else:
            updated.append(recid)
    
    return {'DELETED': deleted, 'UPDATED': updated, 'ADDED': added}, recid, str(mod_date)
Beispiel #12
0
def get_recids_changes(last_recid, max_recs=10000, mod_date=None, table='bibrec'):
    """
    Retrieves the sets of records that were added/updated/deleted
    
    The time is selected according to some know recid, ie. 
    we retrieve the modification time of one record and look
    at those that are older. 
    
        OR
        
    You can pass in the date that you are interested in, in the 
    format: YYYY-MM-DD HH:MM:SS
    
    added => bibrec.modification_date == bibrec.creation_date
    updated => bibrec.modification_date >= bibrec.creation_date
    deleted => bibrec.status == DELETED
    
    We usually return list of max_recs size, however if there 
    are records past max_recs size and their modification date 
    is the same, we'll add them too. This is a workaround and 
    necessary because Invenio doesn't use timestamp with high
    enough granularity
    """
    
    table = dbquery.real_escape_string(table)
    search_op = '>'    
    if not mod_date:
        if last_recid == -1:
            l = list(dbquery.run_sql("SELECT modification_date FROM `%s` ORDER BY modification_date ASC LIMIT 1" % (table,)))
            mod_date = l[0][0].strftime(format="%Y-%m-%d %H:%M:%S")
            search_op = '>='
        else:
            l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id = %s LIMIT 1", (last_recid,)))
            if not len(l):
                # let's make sure we have a valid recid (or get the close valid one) BUT this could mean we get the wrong
                # modification date! (and could result in non-ending loop!)
                l = list(dbquery.run_sql("SELECT id, modification_date FROM `" + table + "` WHERE id >= %s LIMIT 1", (last_recid,)))
                if len(l):
                    sys.stderr.write("Warning, the last_recid %s does not exist, we have found the closest higher id %s with mod_date: %s\n" %
                           (l[0][0], l[0][1].strftime(format="%Y-%m-%d %H:%M:%S")))
            if not len(l):
                return
            last_recid = l[0][0]
            mod_date = l[0][1].strftime(format="%Y-%m-%d %H:%M:%S")
            
            # there is not api to get this (at least i haven't found it)
            #mod_date = search_engine.get_modification_date(last_recid, fmt="%Y-%m-%d %H:%i:%S")
            #if not mod_date:
            #    return
        
    modified_records = list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table +
                    "` WHERE modification_date " + search_op + " '%s' ORDER BY modification_date ASC, id ASC LIMIT %s" %
                    (mod_date, max_recs )))
    
    #sys.stderr.write(str(("SELECT id,modification_date, creation_date FROM bibrec "
    #                "WHERE modification_date " + search_op + "\"%s\" ORDER BY modification_date ASC, id ASC LIMIT %s" %
    #                (mod_date, max_recs ))) + "\n")
    #print len(modified_records)
    
    if not len(modified_records):
        return
    
    # because invenio understands regularity of only one sec (which is very stupid) we must make sure
    # that we include all records that were modified in that one second to close the group
    #if modified_records[-1][1].strftime(format="%Y-%m-%d %H:%M:%S") == mod_date:
    for to_add in list(dbquery.run_sql("SELECT id,modification_date, creation_date FROM `" + table +
                    "` WHERE modification_date = '%s' AND id > %s ORDER BY id ASC" %
                    (modified_records[-1][1].strftime(format="%Y-%m-%d %H:%M:%S"), modified_records[-1][0]))):
            modified_records.append(to_add)
            
    
    added = []
    updated = []
    deleted = []
    

    # find records that are marked as deleted, this can select lots of recs (in the worst case
    # all deleted recs) but usually it will work with a subrange
    rec_ids = sorted([r[0] for r in modified_records])
    local_min = rec_ids[0]
    local_max = rec_ids[-1]

    dels = {}
    for x in list(dbquery.run_sql("""SELECT distinct(id_bibrec) FROM bibrec_bib98x WHERE 
        (id_bibrec >= %s AND id_bibrec <= %s) AND 
        id_bibxxx=(SELECT id FROM bib98x WHERE VALUE='DELETED')""" % (local_min, local_max))):
        dels[int(x[0])] = 1
    #print "delets local min=%s max=%s found=%s" % (local_min, local_max, len(dels))
    
    
    for recid, mod_date, create_date in modified_records:
        recid = int(recid)
        
        # this is AWFULLY slow! 100x times
        #status = search_engine.record_exists(recid)
        
        if recid in dels:
        #if status == -1:
            deleted.append(recid)
        elif mod_date == create_date:
            added.append(recid)
        else:
            updated.append(recid)
    
    return {'DELETED': deleted, 'UPDATED': updated, 'ADDED': added}, recid, mod_date.strftime(format="%Y-%m-%d %H:%M:%S")