# Imports assumed by this excerpt (inferred from usage below): "sge" is the
# project's own helper module, and the database driver is assumed to be
# pymysql-compatible, since the code uses mariadb.cursors.DictCursor.
import argparse
import os
import re
import socket
import sys
import syslog
import time

import yaml
import pymysql as mariadb

import sge


def process_accounting(init, db, cursor, serviceid, service, debug):
    # Process any waiting lines
    for record in sge.records(accounting=init['fh']):
        if init['record_num'] >= init['max_record']:

            record['service'] = service
            record['serviceid'] = serviceid
            record['record'] = init['record_num']
            record['job'] = str(record['job_number']) + "." + str(
                record['task_number'] or 1)

            if debug: print(record['job'], "record accounting")

            cursor.execute(init['add_record'], record)

            # Record job as requiring classification
            sge.sql_get_create(
                cursor,
                "SELECT * FROM jobs WHERE serviceid = %(serviceid)s AND job = %(job)s",
                {
                    'serviceid': serviceid,
                    'job': record['job'],
                    'classified': False,
                },
                insert=
                "INSERT INTO jobs (serviceid, job, classified) VALUES (%(serviceid)s, %(job)s, %(classified)s)",
                update=
                "UPDATE jobs SET classified=%(classified)s WHERE serviceid = %(serviceid)s AND job = %(job)s",
            )

            db.commit()

        init['record_num'] += 1
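# sge.records() is defined elsewhere in the project. A minimal sketch of its
# assumed behaviour, for orientation only: SGE accounting files hold one
# colon-separated record per finished job, with '#' comment lines; the field
# names below are an illustrative subset of the real accounting format.
def records_sketch(accounting):
    fields = ('qname', 'hostname', 'group', 'owner', 'job_name', 'job_number',
              'account', 'priority', 'submission_time', 'start_time',
              'end_time', 'failed', 'exit_status')
    for line in accounting:
        if line.startswith('#'):
            continue
        record = dict(zip(fields, line.rstrip('\n').split(':')))
        # array tasks carry a task_number field further along the line;
        # default it so callers can build the "job.task" key reliably
        record.setdefault('task_number', 0)
        yield record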
def init_syslogfile(cursor, serviceid, service, fname):
    # Determine number of old syslog records

    sql = sge.sql_get_create(
        cursor,
        "SELECT * FROM data_source_state WHERE serviceid = %s AND host = %s AND name = %s",
        (serviceid, socket.getfqdn(), fname),
        insert=
        "INSERT INTO data_source_state (serviceid, host, name) VALUES (%s, %s, %s)",
        first=True,
    )

    sys_max_record = sql['state'] or 0  # state may be NULL on a fresh row
    syslog.syslog("Found " + str(sys_max_record) + " old syslog " +
                  service + " records")

    s_fh = open(fname)
    sys_record_num = 0

    return {
        'fh': s_fh,
        'max_record': sys_max_record,
        'record_num': sys_record_num,
        'fname': fname
    }
def sql_insert_host(cursor, serviceid, host):
    return (sge.sql_get_create(
        cursor,
        "SELECT id, name FROM hosts WHERE serviceid = %(serviceid)s AND name = %(name)s",
        {
            'serviceid': serviceid,
            'name': host,
        },
        insert=
        "INSERT INTO hosts (serviceid, name, name_sha1) VALUES (%(serviceid)s, %(name)s, SHA1(%(name)s))",
        first=True,
    ))
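# sge.sql_get_create() is the project's get-or-create helper and is not shown
# in this excerpt. A minimal sketch of its assumed contract, inferred from the
# call sites above: SELECT first; on a miss run the insert (plus any oninsert
# statement), on a hit run any update statement; then re-SELECT and return the
# rows, or just the first row when first=True.
def sql_get_create_sketch(cursor, select, params, insert=None, update=None,
                          oninsert=None, first=False):
    if cursor.execute(select, params):
        if update: cursor.execute(update, params)
    else:
        if insert: cursor.execute(insert, params)
        if oninsert: cursor.execute(oninsert, params)
    cursor.execute(select, params)
    rows = cursor.fetchall()
    return rows[0] if first else rows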
def main():
    # Command line arguments
    parser = argparse.ArgumentParser(description='Feed accounting data')
    parser.add_argument('--service',
                        action='store',
                        type=str,
                        help="Service name to tag records")
    parser.add_argument('--accountingfile',
                        action='store',
                        type=str,
                        help="Accounting file to read from")
    parser.add_argument('--syslogfile',
                        action='store',
                        type=str,
                        help="Syslog file to read from")
    parser.add_argument(
        '--sawrapdir',
        action='store',
        type=str,
        help="qstat3 sawrap dir to read node availability data from")
    parser.add_argument('--sleep',
                        action='store',
                        type=int,
                        default=300,
                        help="Time to sleep between loop trips")
    parser.add_argument('--credfile',
                        action='store',
                        type=str,
                        help="YAML credential file")
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help="Print debugging messages")
    parser.add_argument('--pidfile',
                        action='store',
                        help="Store program PID in file")
    args = parser.parse_args()

    if not args.service:
        raise SystemExit("Error: provide a service name argument")

    if args.credfile:
        with open(args.credfile, 'r') as stream:
            credentials = yaml.safe_load(stream)
    else:
        raise SystemExit("Error: provide a database credential file")

    if args.pidfile:
        with open(args.pidfile, 'w') as stream:
            stream.write(str(os.getpid()))

    syslog.openlog()

    # Try connecting to database and processing records.
    # Retry after a delay if there's a failure.
    while True:
        if args.debug: print("Entering main loop")
        try:
            # Disconnect any previous session
            if 'db' in locals(): sge.dbtidy(db)

            # Connect to database
            db = mariadb.connect(**credentials)
            cursor = db.cursor(mariadb.cursors.DictCursor)

            # Get service id
            sql = sge.sql_get_create(
                cursor,
                "SELECT id FROM services WHERE name = %s",
                (args.service, ),
                insert="INSERT INTO services (name) VALUES (%s)",
                first=True,
            )
            serviceid = sql['id']
            db.commit()

            # Initialise state
            if args.accountingfile:
                i_account = init_accounting(cursor, serviceid, args.service,
                                            args.accountingfile)

            if args.syslogfile:
                i_syslog = init_syslogfile(cursor, serviceid, args.service,
                                           args.syslogfile)

            # Process records as they come in
            while True:
                # SGE accounting records
                if args.accountingfile:
                    process_accounting(i_account, db, cursor, serviceid,
                                       args.service, args.debug)

                # Syslog records
                if args.syslogfile:
                    process_syslogfile(i_syslog, db, cursor, serviceid,
                                       args.service, args.debug)

                # Node availability data
                if args.sawrapdir:
                    process_sawrapdir(args.sawrapdir, db, cursor, serviceid,
                                      args.debug)

                if args.debug: print("sleeping...")
                time.sleep(args.sleep)
        except Exception:
            syslog.syslog("Processing failed: " + str(sys.exc_info()))

        time.sleep(args.sleep)
def process_sawrapdir(dname, db, cursor, serviceid, debug):
    # Check we have all historical data
    for fname in os.listdir(dname):
        qstat3 = os.path.join(dname, fname)

        # Retrieve progress
        sql = sge.sql_get_create(
            cursor,
            "SELECT active,state FROM data_source_state WHERE serviceid = %s AND host = %s AND name = %s",
            (serviceid, socket.getfqdn(), qstat3),
            insert=
            "INSERT INTO data_source_state (serviceid, host, name) VALUES (%s, %s, %s)",
            first=True,
        )

        # Skip if file no longer active
        if not sql['active']: continue

        # Skip if nothing in file
        st = os.stat(qstat3)
        if st.st_size == 0: continue

        if debug: print("Processing", qstat3)
        line_num = 0

        for line in sge.open_file(qstat3):
            line_num += 1
            if line_num <= (sql['state'] or 0): continue

            r = re.match(
                r"""
               (?P<time>\d+)\s+
               (?P<queue>\S+)@
               (?P<host>\S+?)\.\S+\s+
               [BIPC]+\s+
               (?P<slots_reserved>\d+)/
               (?P<slots_used>\d+)/
               (?P<slots_total>\d+)\s+
               \S+\s+
               \S+\s+
               (?P<flags>\S+)?
            """,
                line,
                re.VERBOSE,
            )
            if r:
                d = r.groupdict()

                # Lookup relationships
                rec_q = sql_insert_queue(cursor, serviceid, d['queue'])
                rec_h = sql_insert_host(cursor, serviceid, d['host'])

                # Fill out status
                d['serviceid'] = serviceid
                d['queueid'] = rec_q['id']
                d['hostid'] = rec_h['id']
                d['ttl'] = 10 * 60  # 10 minutes by default
                d['enabled'] = True
                d['available'] = True
                if d['flags']:
                    d['enabled'] = "d" not in d['flags']
                    if re.match(r"[cdsuE]", d['flags']): d['available'] = False

                # Insert record if not already there
                sge.sql_get_create(
                    cursor,
                    "SELECT * FROM availability WHERE serviceid = %(serviceid)s AND time = %(time)s AND hostid = %(hostid)s AND queueid = %(queueid)s",
                    d,
                    insert=
                    "INSERT INTO availability (serviceid, time, hostid, queueid, slots_reserved, slots_used, slots_total, enabled, available, ttl) VALUES (%(serviceid)s, %(time)s, %(hostid)s, %(queueid)s, %(slots_reserved)s, %(slots_used)s, %(slots_total)s, %(enabled)s, %(available)s, %(ttl)s)",
                )
                db.commit()

        # If file is older than 3 days, mark as inactive
        # (to avoid reprocessing stuff all the time)
        st = os.stat(qstat3)
        active = True
        if time.time() - max(st.st_mtime, st.st_ctime) > 3 * 24 * 3600:
            active = False

        # Record progress (lazy - do at end of file)
        cursor.execute(
            "UPDATE data_source_state SET active=%s,state=%s WHERE serviceid = %s AND host = %s AND name = %s",
            (active, line_num, serviceid, socket.getfqdn(), qstat3),
        )

        db.commit()
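# The qstat3 parser above is easiest to sanity-check against a concrete line.
# The sample below is made up, in the "epoch queue@host states
# reserved/used/total load arch [flags]" shape the regex expects:
def _demo_qstat3_regex():
    line = "1600000000 all.q@node001.example.com BIP 0/4/16 0.50 lx-amd64 d"
    r = re.match(
        r"""
        (?P<time>\d+)\s+
        (?P<queue>\S+)@
        (?P<host>\S+?)\.\S+\s+
        [BIPC]+\s+
        (?P<slots_reserved>\d+)/
        (?P<slots_used>\d+)/
        (?P<slots_total>\d+)\s+
        \S+\s+
        \S+\s+
        (?P<flags>\S+)?
        """, line, re.VERBOSE)
    # -> queue='all.q', host='node001', slots 0/4/16, flags='d'
    return r.groupdict()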
def process_syslogfile(init, db, cursor, serviceid, service, debug):
    # Process any waiting lines (syslog_records() is a project helper,
    # assumed to yield one parsed dict per new syslog line)
    for record in syslog_records(file=init['fh']):
        init['record_num'] += 1

        # Skip processed lines
        if init['record_num'] < init['max_record']: continue

        # Record line as processed
        cursor.execute(
            "UPDATE data_source_state SET state=%s WHERE serviceid = %s AND host = %s AND name = %s",
            (init['record_num'], serviceid, socket.getfqdn(), init['fname']),
        )

        # Allocate to service, flag as needing classification if
        # we update the record
        record['service'] = service
        record['serviceid'] = serviceid
        record['classified'] = False

        # Retrieve/create existing record
        sql = sge.sql_get_create(
            cursor,
            "SELECT * FROM jobs WHERE serviceid = %(serviceid)s AND job = %(job)s",
            record,
            insert=
            "INSERT INTO jobs (serviceid, job, classified) VALUES (%(serviceid)s, %(job)s, %(classified)s)",
            first=True,
        )

        # Update fields according to syslog data
        if record['type'] == "mpirun":

            # Get mpirun file record
            mpirun = sge.sql_get_create(
                cursor,
                "SELECT id, name FROM mpiruns WHERE name = %(name)s",
                {
                    'name': record['mpirun_file'],
                },
                insert=
                "INSERT INTO mpiruns (name, name_sha1) VALUES (%(name)s, SHA1(%(name)s))",
                first=True,
            )

            # Add mpirun file to job record if needed
            # Mark job as needing fresh classification
            sge.sql_get_create(
                cursor,
                "SELECT * FROM job_to_mpirun WHERE jobid = %(jobid)s AND mpirunid = %(mpirunid)s",
                {
                    'jobid': sql['id'],
                    'mpirunid': mpirun['id'],
                },
                insert=
                "INSERT INTO job_to_mpirun (jobid, mpirunid) VALUES (%(jobid)s, %(mpirunid)s)",
                oninsert=
                "UPDATE jobs SET classified=FALSE WHERE id = %(jobid)s",
            )

            if debug: print(record['job'], "mpirun", record['mpirun_file'])

        elif record['type'] == "sgealloc":
            if record['alloc']:
                hosts = sql['hosts'] or 0  # may be NULL on a fresh job row

                for alloc in record['alloc'].split(','):
                    r = re.match(r"([^@]+)@([^=]+)=(\d+)", alloc)
                    if r:
                        q = r.group(1)
                        h = r.group(2)
                        slots = r.group(3)
                        hosts += 1

                        # Get queue record
                        rec_q = sql_insert_queue(cursor, serviceid, q)

                        # Get host record
                        rec_h = sql_insert_host(cursor, serviceid, h)

                        # Add allocation to job record if needed
                        # Mark job as needing fresh classification
                        sge.sql_get_create(
                            cursor,
                            "SELECT * FROM job_to_alloc WHERE jobid = %(jobid)s AND hostid = %(hostid)s AND queueid = %(queueid)s",
                            {
                                'jobid': sql['id'],
                                'hostid': rec_h['id'],
                                'queueid': rec_q['id'],
                                'slots': slots,
                                'hosts': hosts,
                            },
                            insert=
                            "INSERT INTO job_to_alloc (jobid, hostid, queueid, slots) VALUES (%(jobid)s, %(hostid)s, %(queueid)s, %(slots)s)",
                            oninsert=
                            "UPDATE jobs SET classified=FALSE, hosts=%(hosts)s WHERE id = %(jobid)s",
                        )

                        if debug: print(record['job'], "update sgealloc")

        elif record['type'] == "sgenodes":
            if record['nodes_nodes']:
                if sql['nodes_nodes'] != int(record['nodes_nodes']) or \
                   sql['nodes_np'] != int(record['nodes_np']) or \
                   sql['nodes_ppn'] != int(record['nodes_ppn']) or \
                   sql['nodes_tpp'] != int(record['nodes_tpp']):

                    if debug: print(record['job'], "update sgenodes")
                    sql_update_job(
                        cursor,
                        "nodes_nodes=%(nodes_nodes)s, nodes_np=%(nodes_np)s, nodes_ppn=%(nodes_ppn)s, nodes_tpp=%(nodes_tpp)s",
                        record)

        elif record['type'] == "sgemodules" or \
             record['type'] == "module load":

            if record['modules']:
                for module in record['modules'].split(':'):
                    # Get module record
                    mod = sge.sql_get_create(
                        cursor,
                        "SELECT id, name FROM modules WHERE name = %(name)s",
                        {
                            'name': module,
                        },
                        insert=
                        "INSERT INTO modules (name, name_sha1) VALUES (%(name)s, SHA1(%(name)s))",
                        first=True,
                    )

                    # Add module file to job record if needed
                    # Mark job as needing fresh classification
                    sge.sql_get_create(
                        cursor,
                        "SELECT * FROM job_to_module WHERE jobid = %(jobid)s AND moduleid = %(moduleid)s",
                        {
                            'jobid': sql['id'],
                            'moduleid': mod['id'],
                        },
                        insert=
                        "INSERT INTO job_to_module (jobid, moduleid) VALUES (%(jobid)s, %(moduleid)s)",
                        oninsert=
                        "UPDATE jobs SET classified=FALSE WHERE id = %(jobid)s",
                    )

                if debug: print(record['job'], "module", record['modules'])

        elif record['type'] == "sge-allocator: Resource stats nvidia":

            # Get host record
            rec_h = sql_insert_host(cursor, serviceid, record['host'])

            # Get coproc record
            # (tag with hostname as coproc name is currently just a
            # index on a host. Not necessary if we started using the
            # card UUID instead)
            rec_cp = sge.sql_get_create(
                cursor,
                "SELECT id, name, model FROM coprocs WHERE name = %(name)s",
                {
                    'name': record['host'] + ":" + record['name'],
                    'model': record['model'],
                    'memory': 1024 * 1024 * int(record['coproc_max_mem']),  # bytes
                },
                insert=
                "INSERT INTO coprocs (name, name_sha1, model, model_sha1, memory) VALUES (%(name)s, SHA1(%(name)s), %(model)s, SHA1(%(model)s), %(memory)s)",
                first=True,
            )

            # Add to job record (and update coproc stats) if not seen this
            # allocation before. Totals read back from the jobs row may be
            # NULL on a freshly created job, so treat NULL as zero before
            # accumulating.
            sge.sql_get_create(
                cursor,
                "SELECT jobid FROM job_to_coproc WHERE jobid = %(jobid)s AND hostid = %(hostid)s AND coprocid = %(coprocid)s",
                {
                    'jobid': sql['id'],
                    'hostid': rec_h['id'],
                    'coprocid': rec_cp['id'],
                    'coproc': (sql['coproc'] or 0) + 1,
                    'coproc_max_mem': 1024 * 1024 * int(record['coproc_max_mem']),  # bytes
                    'coproc_cpu': float(record['coproc_cpu']) / 100,  # s
                    'coproc_mem': float(record['coproc_mem']) / (100 * 1024),  # GiB * s
                    'coproc_maxvmem': 1024 * 1024 * int(record['coproc_maxvmem']),  # bytes
                    'sum_coproc_max_mem':
                    1024 * 1024 * int(record['coproc_max_mem']) +
                    (sql['coproc_max_mem'] or 0),  # bytes
                    'sum_coproc_cpu':
                    float(record['coproc_cpu']) / 100 +
                    (sql['coproc_cpu'] or 0),  # s
                    'sum_coproc_mem':
                    float(record['coproc_mem']) / (100 * 1024) +
                    (sql['coproc_mem'] or 0),  # GiB * s
                    'sum_coproc_maxvmem':
                    1024 * 1024 * int(record['coproc_maxvmem']) +
                    (sql['coproc_maxvmem'] or 0),  # bytes
                },
                insert=
                "INSERT INTO job_to_coproc (jobid, hostid, coprocid, coproc_max_mem, coproc_cpu, coproc_mem, coproc_maxvmem) VALUES (%(jobid)s, %(hostid)s, %(coprocid)s, %(coproc_max_mem)s, %(coproc_cpu)s, %(coproc_mem)s, %(coproc_maxvmem)s)",
                oninsert=
                "UPDATE jobs SET classified=FALSE, coproc=%(coproc)s, coproc_max_mem=%(sum_coproc_max_mem)s, coproc_cpu=%(sum_coproc_cpu)s, coproc_mem=%(sum_coproc_mem)s, coproc_maxvmem=%(sum_coproc_maxvmem)s WHERE id = %(jobid)s",
            )

            if debug: print(record['job'], "update gpu stats")

        elif record['type'] == "sgeepilog":
            if record['epilog_copy']:
                if sql['epilog_copy'] != int(record['epilog_copy']):

                    if debug: print(record['job'], "update sgeepilog")
                    sql_update_job(cursor, "epilog_copy=%(epilog_copy)s",
                                   record)

        else:
            if debug: print("Unhandled record type:", record['type'])

        db.commit()
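# sql_update_job() is not shown in this excerpt. From its call sites it takes
# a SET clause and the parameter dict, and jobs are keyed here by the
# (serviceid, job) pair; a minimal sketch under those assumptions, which also
# flags the job for reclassification:
def sql_update_job_sketch(cursor, set_clause, record):
    cursor.execute(
        "UPDATE jobs SET " + set_clause + ", classified=FALSE"
        " WHERE serviceid = %(serviceid)s AND job = %(job)s",
        record,
    )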
def main():
    # Command line arguments
    parser = argparse.ArgumentParser(description='Classify accounting data')
    parser.add_argument('--services',
                        action='store',
                        type=str,
                        help="Service names to process records for")
    parser.add_argument('--sleep',
                        action='store',
                        type=int,
                        default=300,
                        help="Time to sleep between loop trips")
    parser.add_argument('--credfile',
                        action='store',
                        type=str,
                        help="YAML credential file")
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help="Print debugging messages")
    parser.add_argument('--limit',
                        action='store',
                        type=int,
                        default=1000,
                        help="Max number of records to classify at once")
    parser.add_argument('--reportmpi',
                        action='store_true',
                        default=False,
                        help="Report on mpirun exes we don't have regexes for")
    args = parser.parse_args()

    if args.credfile:
        with open(args.credfile, 'r') as stream:
            credentials = yaml.safe_load(stream)
    else:
        raise SystemExit("Error: provide a database credential file")

    if args.reportmpi:
        reportmpi(credentials)
        raise SystemExit

    if not args.services:
        raise SystemExit("Error: provide service name arguments")

    args.services = commasep_list(args.services)

    syslog.openlog()

    # Try connecting to database and processing records.
    # Retry after a delay if there's a failure.
    while True:
        if args.debug: print("Entering main loop")
        try:
            # Disconnect any previous session
            if 'db' in locals(): sge.dbtidy(db)

            # Connect to database
            db = mariadb.connect(**credentials)
            cursor = db.cursor(mariadb.cursors.DictCursor)

            while True:
                for service in args.services:

                    # Get service id
                    sql = sge.sql_get_create(
                        cursor,
                        "SELECT id,name FROM services WHERE name = %s",
                        (service, ),
                        insert="INSERT INTO services (name) VALUES (%s)",
                        first=True,
                    )
                    serviceid = sql['id']
                    db.commit()

                    # Search for unclassified records
                    while cursor.execute(
                            "SELECT * FROM jobs WHERE serviceid = %s AND classified=FALSE LIMIT %s",
                        (serviceid, args.limit)):

                        # Classify waiting records
                        for sql in cursor:
                            classify(db, sql, service, args.debug)

                        # Commit and obtain an up to date view of database state
                        db.commit()

                if args.debug: print("sleeping...")
                time.sleep(args.sleep)

                # Update view of database state
                db.rollback()
        except Exception:
            syslog.syslog("Processing failed: " + str(sys.exc_info()))

        time.sleep(args.sleep)
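# commasep_list() is not defined in this excerpt; judging from its use on
# args.services it turns a comma-separated option value into a clean list of
# names. A minimal sketch:
def commasep_list_sketch(value):
    return [item.strip() for item in value.split(',') if item.strip()]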