def runner_sqlschedules(options, dbdescr, experiment, *strings):
    """
    Schedule multiple jobs from the command line to run using the sql command.

    Usage: sqlschedules <tablepath> <experiment> <parameters>

    See the sqlschedule command for <tablepath> <experiment>.
    We accept the dbidispatch syntax, where <parameters> is
    interpreted as follows:

      The parameters may contain one or more segments of the form
      {{a,b,c,d}}, which generate multiple jobs to execute. Each
      segment is replaced by one of its comma-separated values: the
      first job gets the value a, the second the value b, and so on.
      If there are several segments, the cross-product of all their
      values is generated. (A sketch of this expansion follows this
      function.)
    """
    parser = getattr(parse, options.parser, None) or resolve(options.parser)

    db = open_db(dbdescr, serial=True)

    # resolve(experiment)  # try to load the function associated with the experiment

    verbose = not options.quiet

    (commands, choice_args) = generate_commands(strings)
    if verbose:
        print commands, choice_args

    if options.force:
        for cmd in commands:
            state = parser(*cmd)
            state['jobman.experiment'] = experiment
            sql.add_experiments_to_db([state] * (options.repeat),
                                      db,
                                      verbose=verbose,
                                      force_dup=True)
        if options.quiet:
            print "Added %d jobs to the db" % len(commands)
    else:
        # if the first insert fails, we won't force the others, as the
        # force option was not given.
        failed = 0
        for cmd in commands:
            state = parser(*cmd)
            state['jobman.experiment'] = experiment
            ret = sql.add_experiments_to_db([state],
                                            db,
                                            verbose=verbose,
                                            force_dup=options.force)
            if ret[0][0]:
                sql.add_experiments_to_db([state] * (options.repeat - 1),
                                          db,
                                          verbose=verbose,
                                          force_dup=True)
            else:
                failed += 1
                if verbose:
                    print "The last cmd failed to insert, we won't repeat it. use --force to force the duplicate of job in the db."
        print "Added", len(commands) - failed, "on", len(commands), "jobs"
def db(dbstr):
    """ DEPRECATED: call api0.open_db(dbstr), which has the same api """
    import warnings
    warnings.warn(
        "sql.db is deprecated, call api0.open_db", DeprecationWarning)
    import api0
    return api0.open_db(dbstr)
def runner_sqlreload(options, dbdescr, table_dir, *ids):
    """
    Put data from the experiment directory back into the sql db.

    Useful in case you deleted the db or part of it.

    Example use:

        jobman sqlreload [--all] postgres://user:pass@host[:port]/dbname?table=tablename ~/expdir/dbname/tablename 10 11
    """
    if table_dir[-1] == os.path.sep:
        table_dir = table_dir[:-1]

    db = open_db(dbdescr, serial=True)

    assert os.path.split(table_dir)[-1] == db.tablename
    assert os.path.split(os.path.split(table_dir)[0])[-1] == db.dbname
    expdir = os.path.split(os.path.split(table_dir)[0])[0]

    if options.all:
        assert len(ids) == 0
        ids = []
        for p in os.listdir(table_dir):
            try:
                ids += [int(p)]
            except ValueError:
                print 'Skipping entry %s, as it is not a jobman id.' % p
    else:
        # Ensure that ids are all integers.
        ids = [int(d) for d in ids]

    try:
        session = db.session()
        for id in ids:
            # Get state dict from the file
            file_name = '%s/%i/current.conf' % (table_dir, id)
            file_state = parse.filemerge(file_name)

            # Get state dict from the DB
            db_state = db.get(id)
            if db_state is None:
                # No such dict exists; we have to insert it with the right id
                file_state['jobman.id'] = id
                db.insert(file_state, session=session)
            else:
                db_state.update_in_session(file_state, session=session)
    finally:
        session.close()
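
# A short illustration (assumed helper, mirroring the asserts above) of the
# <expdir>/<dbname>/<tablename> layout that sqlreload expects table_dir to
# follow:
import os

def split_table_dir_sketch(table_dir):
    """Return (expdir, dbname, tablename) from an experiment table path."""
    table_dir = table_dir.rstrip(os.path.sep)
    rest, tablename = os.path.split(table_dir)
    expdir, dbname = os.path.split(rest)
    return expdir, dbname, tablename

# split_table_dir_sketch('/home/me/expdir/dbname/tablename')
# -> ('/home/me/expdir', 'dbname', 'tablename')
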
def runner_sqlschedule(options, dbdescr, experiment, *strings):
    """
    Schedule a job to run using the sql command.

    Usage: sqlschedule <tablepath> <experiment> <parameters>

    See the experiment and parameters topics for more information about
    these parameters.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename

    If no table is named `tablename`, one will be created
    automatically. The state corresponding to the experiment and
    parameters specified in the command will be saved in the database,
    but no experiment will be run.

    To run an experiment scheduled using sqlschedule, see the sql
    command.

    Example use:
        jobman sqlschedule postgres://user:pass@host[:port]/dbname?table=tablename \\
            mymodule.my_experiment \\
            stopper::pylearn.stopper.nsteps \\ # use pylearn.stopper.nsteps
            stopper.n=10000 \\ # the argument "n" of nsteps is 10000
            lr=0.03

        You can use jobman.experiments.example1 as a working
        mymodule.my_experiment.
    """
    db = open_db(dbdescr, serial=True)

    parser = getattr(parse, options.parser, None) or resolve(options.parser)

    state = parser(*strings)
    # try to load the function associated with the experiment
    resolve(experiment)
    state['jobman.experiment'] = experiment
    sql.add_experiments_to_db([state], db, verbose=1, force_dup=options.force)
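
# A hedged sketch (not jobman's open_db; names are illustrative) of how a
# tablepath of the documented form
# postgres://user:pass@host[:port]/dbname?table=tablename can be decomposed:
def split_tablepath_sketch(tablepath):
    """Split a tablepath into its documented components."""
    scheme, rest = tablepath.split('://', 1)
    rest, table = rest.split('?table=', 1)
    auth, rest = rest.split('@', 1)
    hostport, dbname = rest.split('/', 1)
    user = auth.split(':', 1)[0]
    host = hostport.split(':', 1)[0]
    return scheme, user, host, dbname, table

# split_tablepath_sketch('postgres://user:pass@host:5432/dbname?table=tbl')
# -> ('postgres', 'user', 'host', 'dbname', 'tbl')
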
def runner_sqlview(options, dbdescr, viewname):
    """
    Create/drop a view of the scheduled experiments.

    Usage: jobman sqlview <tablepath> <viewname>

    The jobs should be scheduled first with the sqlschedule command.
    Also, it is more useful to execute it after some experiments have
    finished.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename


    Example use:
        Assuming the following was executed and at least one experiment
        has finished:
        jobman sqlschedule postgres://user:pass@host[:port]/dbname?table=tablename \\
            mymodule.my_experiment \\
            stopper::pylearn.stopper.nsteps \\ # use pylearn.stopper.nsteps
            stopper.n=10000 \\ # the argument "n" of nsteps is 10000
            lr=0.03
        this will create a view with a column for each parameter and
        each key=value set in the state by the jobs:
        jobman sqlview postgres://user:pass@host[:port]/dbname?table=tablename viewname

        You can use jobman.experiments.example1 as a working
        mymodule.my_experiment.
    """
    db = open_db(dbdescr, serial=True)

    if options.drop:
        db.dropView(viewname, not options.quiet)
    else:
        db.createView(viewname, not options.quiet)
def cachesync_runner(options, dir):
    """
    Syncs the working directory of jobs with remote cache.

    Usage: cachesync [options] <path_to_job(s)_workingdir(s)>

    (For this to work, though, you need to do a channel.save() at least
    once in your job before calling cachesync, otherwise the host_name
    and host_workdir won't be set in current.conf)

    For the purpose of this command, see below.

    It can either sync a single directory, which must contain a
    "current.conf" file specifying the remote host and directory. Example
    for a single directory:

        # this syncs the current directory
        jobman cachesync .

        # this syncs another directory
        jobman cachesync myexperiment/mydbname/mytablename/5

    It can also sync all subdirectories of the directory you specify.
    You must use the -m (or --multiple) option for this.
    Each subdirectory (numbered 1, 2 ... etc based on job number) must
    contain a "current.conf" file specifying the remote host and directory.
    Examples:

        # syncs all subdirectories 1, 2 ...
        jobman cachesync -m myexperiment/mydbname/mytablename 

    Normally completed jobs (status = DONE) won't be synced based on
    the "status" set in current.conf. Yet you can force sync by using
    the -f or --force option.

    The --sql=dbdesc option allows fetching from the db info that is
    missing from the current.conf file. Same syntax as the sql command.

    Purpose of this command
    -----------------------

    To clarify the purpose of the cachesync command: when launching jobs, 
    working directories are created for each job. For example, when launching:

    dbidispatch jobman sql 'postgres://user@gershwin/mydatabase?table=mytable' .

    a directory ``mydatabase`` with subdirectory ``mytable`` will be
    created, containing further subdirectories numbered 1, 2 and 3
    (based on job ids in the DB). These directories are the working
    directories of each job. They contain a copy of the stdout and stderr
    of the job, along with copies of the jobman state (dictionaries in
    .conf files) and further files created by the job.

    Yet the content of those directories is not updated live during the job. 
    The job runs on a cluster node, and those files are first written to a 
    temporary directory on the node itself. Then, when calling channel.save() 
    or when the job finishes, they're rsync'ed over to the working directory 
    where they should be.

    This is annoying since one can't see how the jobs are doing unless one
    SSHes into the cluster node and finds the temporary directory. To
    alleviate this problem, the cachesync command copies the files over to
    the working directory whenever asked to, so it's easier to probe a
    running job's state.
    """
    force = options.force
    multiple = options.multiple
    dbdesc = options.sql
    all_jobs = None
    if dbdesc:
        import api0
        db = api0.open_db(dbdesc, serial=True)

        try:
            session = db.session()
            q = db.query(session)
            all_jobs = q.all()
        finally:
            try:
                session.close()
            except:
                pass

    if multiple:
        sync_all_directories(dir, all_jobs, force)
    else:
        sync_single_directory(dir, all_jobs, force)
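
# A small sketch (assumed helper, not part of jobman) of how -m/--multiple
# could enumerate the numbered job subdirectories described in the docstring:
import os

def job_subdirs_sketch(root):
    """Yield subdirectories of `root` whose names are job ids (integers)."""
    for entry in sorted(os.listdir(root)):
        path = os.path.join(root, entry)
        if entry.isdigit() and os.path.isdir(path):
            yield path
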
def check_serve(options, dbdescr):
    """Check that all jobs marked as running in the db are marked as
    running in some cluster jobs scheduler.

    print jobs that could have crashed/been killed ...

    Example usage:

        jobman check <tablepath>

    """

    db = open_db(dbdescr, serial=True)

    try:
        session = db.session()
        q = db.query(session)
        idle = q.filter_eq('jobman.status', 0).all()
        running = q.filter_eq('jobman.status', 1).all()
        finished = q.filter_eq('jobman.status', 2).all()
        err_start = q.filter_eq('jobman.status', 3).all()
        err_sync = q.filter_eq('jobman.status', 4).all()
        err_run = q.filter_eq('jobman.status', 5).all()
        canceled = q.filter_eq('jobman.status', -1).all()
        info = []

        print ("I: number of job by status (%d:START, %d:RUNNING, %d:DONE,"
               " %d:ERR_START, %d:ERR_SYNC, %d:ERR_RUN, %d:CANCELED)"
               " in the db (%d:TOTAL)" % (len(idle), len(running),
                                          len(finished), len(err_start),
                                          len(err_sync), len(err_run),
                                          len(canceled), len(q.all())))
        print

        # warn about jobs in an error status
        if len(err_start):
            print "E: The following jobs had an error when starting:",
            print [j.id for j in err_start]
        if len(err_sync):
            print "E: The following jobs had an error during the rsync:",
            print [j.id for j in err_sync]
        if len(err_run):
            print "E: The following jobs had an error while running:",
            print [j.id for j in err_run]
        print

        # check that no 2 jobs are in the same slot+host combination
        host_slot = {}
        now = time.time()

        # check each job still marked as running
        for idx, r in enumerate(running):
            condor_job = False
            sge_job = False
            pbs_job = False

            # find the backend used for the job.
            if ("jobman.sql.condor_slot" in r.keys() and
                    r["jobman.sql.condor_slot"] != "no_condor_slot"):
                condor_job = True
            if "jobman.sql.sge_task_id" in r.keys():
                sge_job = True
            if "jobman.sql.pbs_task_id" in r.keys():
                pbs_job = True
            if (sge_job + condor_job + pbs_job) > 1:
                print "W: Job %d has info suggesting it ran on more than one of condor, sge and pbs. We can't determine the right one." % r.id
                continue
            if not (sge_job or condor_job or pbs_job):
                print "W: Job %d doesn't have condor, sge or pbs info attached to it. We can't determine if it is still running on the cluster. Was the job started by an old version of jobman?" % r.id
                continue

            # check that the job is still running.
            if sge_job:
                check_running_sge_jobs(r, now)
                continue

            if pbs_job:
                check_running_pbs_jobs(r, now)
                continue

            if not condor_job:
                print "W: Job %d is running but don't have the information needed to check if they still run on the jobs scheduler condor/pbs/torque/sge. Possible reasons: the job started with an old version of jobman or on another jobs scheduler."%r.id
                continue

            # We assume the job started on condor.
            try:
                h = r["jobman.sql.host_name"]
                s = r["jobman.sql.condor_slot"]
            except KeyError:
                print "W: Job %d is running but doesn't have the info needed to check it against condor. Possible reasons: the job was started with an old version of jobman or without condor." % r.id
                continue
            st = s + '@' + h
            if st in host_slot:
                try:
                    t0 = str_time(
                        now - running[host_slot[st]]["jobman.sql.start_time"])
                except KeyError:
                    t0 = 'NO_START_TIME'
                try:
                    t1 = str_time(now - r["jobman.sql.start_time"])
                except KeyError:
                    t1 = 'NO_START_TIME'
                print 'E: Job %d and Job %d are running on the same condor slot/host combination. Running times: %s and %s' % (running[host_slot[st]].id, r.id, t0, t1)
            else:
                host_slot[st] = idx

            gjid = None
            if "jobman.sql.condor_global_job_id" in r.keys():
                gjid = r["jobman.sql.condor_global_job_id"]
            elif "jobman.sql.condor_GlobalJobId" in r.keys():
                gjid = r["jobman.sql.condor_GlobalJobId"]
            if gjid is not None:
                submit_host = gjid.split('#')[0]

                # take care of the quoting; condor requires that "" be used
                # around strings.
                cmd = "condor_q -name %s -const 'GlobalJobId==\"%s\"' -format '%%s' 'JobStatus'" % (
                    submit_host, gjid)
                p = Popen(cmd, shell=True, stdout=PIPE)
                ret = p.wait()
                lines = p.stdout.readlines()

                if ret == 127 and len(lines) == 0:
                    print "W: Job %d. condor_q failed. Is condor installed on this computer?"%r.id
                    continue

                if len(lines) == 0:
                    print "E: Job %d is marked as running in the db under condor job %s, but condor reports that this job is finished" % (r.id, gjid)
                    continue
                elif len(lines) == 1:
                    if lines[0] == '0':  # condor unexpanded??? What should we do?
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as unexpanded. We don't know what that means, so we use a heuristic to know if the job is still running." % r.id
                    elif lines[0] == '1':  # condor idle
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as idle. This can mean that the computer that was running this job crashed." % r.id
                        continue
                    elif lines[0] == '2':  # condor running
                        continue
                    elif lines[0] == '3':  # condor removed
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as removed." % r.id
                    elif lines[0] == '4':  # condor completed
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as completed." % r.id
                    elif lines[0] == '5':  # condor held
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as held." % r.id
                    elif lines[0] == '6':  # condor submission error
                        print "E: Job %d is marked as running in the db, but its submitted condor job is marked as a submission error (should not happen: if condor can't start the job, it doesn't select one in the db)." % r.id

                else:
                    print "W: condor returned an answer we don't understand to the query. We will try some heuristic to determine if the job is running. Test command `%s`. stdout returned `%s`" % (cmd, lines)
            info = (r.id,
                    r["jobman.experiment"],
                    r["jobman.sql.condor_slot"],
                    r["jobman.sql.host_name"],
                    r["jobman.sql.start_time"])
            run_time = str_time(now - info[4])

            if info[2] == "no_condor_slot":
                print "W: Job %d is not running on condor(Should not happed...)"%info[0]
            else:
                p = Popen('''condor_status -constraint 'Name == "slot%s@%s"' -format "%%s" Name -format " %%s" State -format " %%s" Activity -format " %%s" RemoteUser -format " %%s\n" RemoteOwner''' % (info[2], info[3]),
                        shell=True, stdout=PIPE)
                p.wait()
                lines = p.stdout.readlines()
                # returned when running: slot<slot>@<host> Claimed Busy bastienf bastienf
                # returned when the slot doesn't exist: empty
                if len(lines) == 0:
                    print "W: Job %d is running on a host (%s) that condor lost the connection with. The job has run for: %s" % (r.id, info[3], run_time)
                    continue
                elif len(lines) != 1 and not (len(lines) == 2 and lines[-1] == '\n'):
                    print "W: Job %d condor_status output not understood:" % r.id, lines
                    continue
                sp = lines[0].split()
                if len(sp) >= 3 and sp[1] in ["Unclaimed", "Owner"] and sp[2] == "Idle":
                    print "E: Job %d: the db says this job is running on %s, but condor says that host isn't running a job. Running time: %s" % (r.id, info[3], run_time)
                elif len(sp) == 5:
                    assert sp[0] == "slot%s@%s" % (info[2], info[3])
                    if sp[3] != sp[4]:
                        print "W: Job %d condor_status output not understood:" % r.id, lines
                    if sp[1] == "Claimed" and sp[2] in ["Busy", "Retiring"]:
                        if sp[4].split('@')[0] == os.getenv("USER"):
                            print "W: Job %d is running on a condor host that is running a job of the same user. Running time: %s" % (r.id, run_time)
                        else:
                            print "E: Job %d is running on a condor host that is running a job for user %s. Running time: %s" % (r.id, sp[4].split('@')[0], run_time)
                    else:
                        print "W: Job %d: condor state of host not understood:" % r.id, sp
                else:
                    print "W: Job %d condor_status output not understood:" % r.id, lines

    finally:
        session.close()
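
# The condor JobStatus codes branched on above, collected into one mapping
# for reference (a convenience sketch; check_serve itself branches inline):
CONDOR_JOB_STATUS = {
    '0': 'unexpanded',
    '1': 'idle',
    '2': 'running',
    '3': 'removed',
    '4': 'completed',
    '5': 'held',
    '6': 'submission error',
}
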
def runner_sqlstatus(options, dbdescr, *ids):
    """Show the status of jobs. Option allow to change it.

    The --resert_prio option set the priority of the jobs back to the
    default value.

    Example use:

        jobman sqlstatus postgres://user:pass@host[:port]/dbname?table=tablename 10 11

    """
    # we don't want to remove all output when we change the db.
    if options.set_status and options.ret_nb_jobs:
        raise UsageError(
            "The options --set_status and --ret_nb_jobs are mutually exclusive."
        )

    db = open_db(dbdescr, serial=True)

    if options.set_status:
        try:
            new_status = to_status_number(options.set_status)
        except ValueError:
            raise UsageError(
                "The option --set_status accepts only the values START, RUNNING, DONE, ERR_START, ERR_SYNC, ERR_RUN, CANCELED or their equivalent int numbers"
            )
    else:
        new_status = None

    have_running_jobs = False
    verbose = not options.quiet
    if options.ret_nb_jobs:
        verbose = 0
    else:
        verbose += 1
    ids = list(ids)
    try:
        session = db.session()

        if options.print_keys:
            q = db.query(session)
            job = q.first()
            print "Keys in the state of the first jobs",
            for k in job.keys():
                print k,
            print
            del q, job, k

        if options.status:
            q = db.query(session)
            jobs = []
            for stat in options.status:
                jobs += q.filter_eq('jobman.status',
                                    to_status_number(stat)).all()

            ids.extend([j.id for j in jobs])
            del jobs, q

        if options.select:
            q = db.query(session)
            j = q.first()
            for param in options.select:
                k, v = param.split('=')
                if k == 'jobman.status':
                    q = q.filter_eq(k, to_status_number(v))
                elif isinstance(j[k], (str, unicode)):
                    q = q.filter_eq(k, v)
                elif isinstance(j[k], float):
                    q = q.filter_eq(k, float(v))
                elif isinstance(j[k], int):
                    q = q.filter_eq(k, int(v))
                else:
                    q = q.filter_eq(k, repr(v))
            jobs = q.all()
            ids.extend([j.id for j in jobs])
            del j, jobs, q

        if options.fselect:
            q = db.query(session)
            jobs = q.all()
            for param in options.fselect:
                k, v = param.split('=', 1)
                # v is expected to evaluate to a callable, e.g. a lambda
                f = eval(v)
                for job in jobs:
                    if k in job:
                        if f(job[k]):
                            ids.append(job.id)
                    else:
                        print "job", job.id, "don't have the attribute", k

            del job, jobs, q

        if options.all:
            q = db.query(session)
            jobs = q.all()
            ids.extend([j.id for j in jobs])
            del q, jobs

        # Remove all dictionaries from the session
        session.expunge_all()

        ids = [int(id) for id in ids]
        ids = list(set(ids))
        ids.sort()
        nb_jobs = len(ids)

        for id in ids:
            job = db.get(id)
            if job is None:
                if verbose > 0:
                    print "Job id %s don't exit in the db" % (id)
                nb_jobs -= 1
                continue
            try:
                prio = job['jobman.sql.priority']
            except Exception:
                prio = 'BrokenDB_priority_DontExist'
            try:
                status = job['jobman.status']
            except KeyError:
                status = 'BrokenDB_Status_DontExist'

            if verbose > 1:
                print "Job id %s, status=%s jobman.sql.priority=%s" % (
                    id, status, str(prio)),

                for p in options.prints:
                    try:
                        print '%s=%s' % (p, job[p]),
                    except KeyError:
                        print '%s=KeyDontExist' % (p),
                print

            if status == RUNNING:
                have_running_jobs = True
            if options.set_status:
                job.__setitem__('jobman.status', new_status, session)
                job.update_in_session({}, session)
            if options.reset_prio:
                job.__setitem__('jobman.sql.priority', 1.0, session)
                job.update_in_session({}, session)

        if options.set_status:
            session.commit()
            print "Changed the status to %d for %d jobs" % (new_status,
                                                            len(ids))
        if options.reset_prio:
            print "Reset the priority to the default value"
        if new_status == CANCELED and have_running_jobs:
            print "WARNING: Canceling jobs only changes their status in the db. Jobs that are already running will continue to run. If a job finishes with status COMPLETE, its status will be changed to DONE; otherwise the status won't be changed."

    finally:
        session.close()

    if options.ret_nb_jobs:
        print nb_jobs
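
# A hedged sketch of the status names/numbers that to_status_number maps
# between, using the pairs printed by check_serve (the real implementation
# may differ):
JOBMAN_STATUS_SKETCH = {'START': 0, 'RUNNING': 1, 'DONE': 2, 'ERR_START': 3,
                        'ERR_SYNC': 4, 'ERR_RUN': 5, 'CANCELED': -1}

def to_status_number_sketch(status):
    """Accept a status name or its int equivalent; raise ValueError otherwise."""
    if status in JOBMAN_STATUS_SKETCH:
        return JOBMAN_STATUS_SKETCH[status]
    number = int(status)  # raises ValueError on anything unrecognized
    if number not in JOBMAN_STATUS_SKETCH.values():
        raise ValueError("unknown status number %d" % number)
    return number
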
def runner_sql(options, dbdescr, exproot):
    """
    Run jobs from a sql table.

    Usage: sql [options] <tablepath> <exproot>

    The jobs should be scheduled first with the sqlschedule command.

    Assuming that a postgres database is running on port `port` of
    `host`, contains a database called `dbname` and that `user` has the
    permissions to create, read and modify tables on that database,
    tablepath should be of the following form:

        postgres://user:pass@host[:port]/dbname?table=tablename

    exproot can be a local path or a remote path. Examples of exproots:
        /some/local/path
        ssh://some_host:/some/remote/path # relative to the filesystem root
        ssh://some_host:other/remote/path # relative to the HOME on some_host

    The exproot will contain a subdirectory hierarchy corresponding to
    the dbname, tablename and job id which is a unique integer.

    The sql runner will pick any job in the table which is not running
    and is not done and will terminate when that job ends. You may call
    the same command multiple times, sequentially or in parallel, to
    run as many unfinished jobs as have been scheduled in that table
    with sqlschedule.

    Example use:
        jobman sql \\
            postgres://user:pass@host[:port]/dbname?table=tablename \\
            ssh://central_host:myexperiments
    """
    if options.modules:
        modules = options.modules.split(',')
    else:
        modules = []
    for module in modules:
        __import__(module, fromlist=[])

    db = open_db(dbdescr, serial=True)
    n = options.n if options.n else -1
    nrun = 0
    try:
        while n != 0:
            if options.workdir:
                workdir = options.workdir
            else:
                if options.workdir_dir and not os.path.exists(
                        options.workdir_dir):
                    os.mkdir(options.workdir_dir)
                workdir = tempfile.mkdtemp(dir=options.workdir_dir)
            print "The working directory is:", os.path.join(
                os.getcwd(), workdir)

            channel = DBRSyncChannel(db,
                                     workdir,
                                     exproot,
                                     redirect_stdout=True,
                                     redirect_stderr=True,
                                     finish_up_after=options.finish_up_after
                                     or None,
                                     save_interval=options.save_every or None)
            channel.run()

            # Useful for manual tests; leave this here, just commented.
            # cachesync_runner.manualtest_before_delete()
            with cachesync_lock(None, workdir):
                # Useful for manual tests; leave this here, just commented.
                # cachesync_runner.manualtest_will_delete()
                shutil.rmtree(workdir, ignore_errors=True)

            n -= 1
            nrun += 1
    except JobError, e:
        if e.args[0] == JobError.NOJOB:
            print 'No more jobs to run (ran %i jobs)' % nrun
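
# A minimal sketch (assumed helper, not jobman's rsync code) distinguishing
# the two exproot forms documented in the docstring above:
def split_exproot_sketch(exproot):
    """Return (host, path); host is None for a local exproot."""
    if exproot.startswith('ssh://'):
        host, path = exproot[len('ssh://'):].split(':', 1)
        return host, path
    return None, exproot

# split_exproot_sketch('ssh://central_host:myexperiments')
# -> ('central_host', 'myexperiments')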