Esempio n. 1
0
def write_workflow_taskfile(config, jobnum, tasks):
    """Write one job's wrapper executions to its task list file.

    The file name comes from ``config.get_filename('jobtasklist', ...)`` and
    the file is created inside the directory named by the padded job number
    (the directory is created first).  Tasks are written one per line,
    ordered by the integer value of their first field, as five
    comma-separated fields.

    Returns the task file name as given by ``config.get_filename`` (the
    padded job number directory is not included).
    """
    taskfile = config.get_filename('jobtasklist',
                                   {pfwdefs.PF_CURRVALS: {'jobnum': jobnum},
                                    'required': True,
                                    intgdefs.REPLACE_VARS: True})

    jobdir = pfwutils.pad_jobnum(jobnum)
    miscutils.coremakedirs(jobdir)

    def _task_order(entry):
        # The first field is compared numerically, not as a string.
        return int(entry[0])

    with open(f"{jobdir}/{taskfile}", 'w') as outfh:
        for entry in sorted(tasks, key=_task_order):
            line = f"{entry[0]}, {entry[1]}, {entry[2]}, {entry[3]}, {entry[4]}"
            outfh.write(line + "\n")
    return taskfile
Esempio n. 2
0
def begblock(argv):
    """Program entry point: prepare all jobs of the current block.

    Reads the workflow config named by ``argv[0]``, changes into the block
    directory, and for every module in the block's module list runs the
    queries, builds the master lists, creates the wrapper instances and
    divides them into jobs.  It then writes the per-job files (task list,
    input tarball, job wcl), the runjob script and the job manager DAG, and
    finally saves the updated config back to the config file.

    When ``PF_USE_DB_OUT`` is true the block and a ``begblock`` task are
    registered in the database and the task is closed with the final status.

    Parameters:
        argv: argument list whose first element is the config file name.
            ``None`` falls back to ``sys.argv``.
            NOTE(review): with the ``sys.argv`` fallback, ``argv[0]`` is the
            script name; callers presumably pass ``sys.argv[1:]`` - confirm.

    Returns:
        ``pfwdefs.PF_EXIT_DRYRUN`` when the config requests a dry run,
        otherwise ``pfwdefs.PF_EXIT_SUCCESS``.

    Raises:
        Any exception raised while preparing the block is re-raised after
        the config has been saved and (when using the DB) the begblock and
        block tasks have been ended with ``PF_EXIT_FAILURE``.
    """
    # Local import: only needed for the debug timing of wrapper instances.
    import time

    if argv is None:
        argv = sys.argv

    configfile = argv[0]
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    config.set_block_info()
    blknum = config[pfwdefs.PF_BLKNUM]

    blkdir = config.getfull('block_dir')
    os.chdir(blkdir)

    (exists, submit_des_services) = config.search('submit_des_services')
    if exists and submit_des_services is not None:
        os.environ['DES_SERVICES'] = submit_des_services
    (exists, submit_des_db_section) = config.search('submit_des_db_section')
    if exists and submit_des_db_section is not None:
        os.environ['DES_DB_SECTION'] = submit_des_db_section

    dbh = None
    blktid = -1
    if miscutils.fwdebug_check(3, 'PFWBLOCK_DEBUG'):
        miscutils.fwdebug_print("blknum = %s" % (config[pfwdefs.PF_BLKNUM]))
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(submit_des_services, submit_des_db_section)
        dbh.insert_block(config)
        blktid = config['task_id']['block'][str(blknum)]
        config['task_id']['begblock'] = dbh.create_task(
            name='begblock',
            info_table=None,
            parent_task_id=blktid,
            root_task_id=int(config['task_id']['attempt']),
            label=None,
            do_begin=True,
            do_commit=True)

    try:
        modulelist = miscutils.fwsplit(
            config.getfull(pfwdefs.SW_MODULELIST).lower())
        modules_prev_in_list = {}

        joblist = {}
        parlist = OrderedDict()
        masterdata = OrderedDict()
        # file name -> index of the module that first reads it ('infiles')
        # or last writes it ('outfiles')
        filelist = {'infiles': {}, 'outfiles': {}}
        for num, modname in enumerate(modulelist):
            print("XXXXXXXXXXXXXXXXXXXX %s XXXXXXXXXXXXXXXXXXXX" % modname)
            if modname not in config[pfwdefs.SW_MODULESECT]:
                miscutils.fwdie(
                    "Error: Could not find module description for module %s\n"
                    % (modname), pfwdefs.PF_EXIT_FAILURE)
            moddict = config[pfwdefs.SW_MODULESECT][modname]

            runqueries(config, configfile, modname, modules_prev_in_list)
            pfwblock.read_master_lists(config, modname, masterdata,
                                       modules_prev_in_list)

            (infsect, outfsect) = pfwblock.get_datasect_types(config, modname)
            pfwblock.fix_master_lists(config, modname, masterdata, outfsect)

            # Modules flagged as no-op get no wrapper instances or jobs.
            if pfwdefs.PF_NOOP not in moddict or not miscutils.convertBool(
                    moddict[pfwdefs.PF_NOOP]):
                pfwblock.create_fullnames(config, modname, masterdata)
                if miscutils.fwdebug_check(
                        9, 'PFWBLOCK_DEBUG') and modname in masterdata:
                    with open('%s-masterdata.txt' % modname, 'w') as fh:
                        miscutils.pretty_print_dict(masterdata[modname], fh)

                pfwblock.add_file_metadata(config, modname)
                sublists = pfwblock.create_sublists(config, modname,
                                                    masterdata)
                if sublists is not None:
                    if miscutils.fwdebug_check(3, 'PFWBLOCK_DEBUG'):
                        miscutils.fwdebug_print("sublists.keys() = %s" %
                                                (list(sublists.keys())))
                loopvals = pfwblock.get_wrapper_loopvals(config, modname)
                wrapinst = pfwblock.create_wrapper_inst(
                    config, modname, loopvals)
                wcnt = 1
                for winst in list(wrapinst.values()):
                    # Timestamp used by the level-6 END debug message below.
                    stime = time.time()
                    if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                        miscutils.fwdebug_print("winst %d - BEG" % wcnt)
                    pfwblock.assign_data_wrapper_inst(config, modname, winst,
                                                      masterdata, sublists,
                                                      infsect, outfsect)
                    pfwblock.finish_wrapper_inst(config, modname, winst,
                                                 outfsect)
                    tempfiles = pfwblock.create_module_wrapper_wcl(
                        config, modname, winst)
                    # Keep the first module that requests each input file.
                    for fl in tempfiles['infiles']:
                        if fl not in filelist['infiles']:
                            filelist['infiles'][fl] = num

                    # Keep the last module that produces each output file.
                    for fl in tempfiles['outfiles']:
                        filelist['outfiles'][fl] = num
                    pfwblock.divide_into_jobs(config, modname, winst, joblist,
                                              parlist)
                    if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                        etime = time.time()
                        miscutils.fwdebug_print("winst %d - %s - END" %
                                                (wcnt, etime - stime))
                    wcnt += 1
            modules_prev_in_list[modname] = True

            if miscutils.fwdebug_check(
                    9, 'PFWBLOCK_DEBUG') and modname in masterdata:
                with open('%s-masterdata.txt' % modname, 'w') as fh:
                    miscutils.pretty_print_dict(masterdata[modname], fh)

        scriptfile = pfwblock.write_runjob_script(config)

        # Files that are both read and written inside this block do not have
        # to exist beforehand, but must be written by an earlier module than
        # the one that first reads them.
        intersect = list(
            set(filelist['infiles'].keys()) & set(filelist['outfiles'].keys()))
        finallist = []

        for fl in list(filelist['infiles'].keys()):
            if fl not in intersect:
                finallist.append(fl)
            else:
                if filelist['infiles'][fl] <= filelist['outfiles'][fl]:
                    raise Exception(
                        'Input file %s requested before it is generated.' %
                        (fl))

        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            missingfiles = dbh.check_files(config, finallist)
            if len(missingfiles) > 0:
                raise Exception(
                    "The following input files cannot be found in the archive:"
                    + ",".join(missingfiles))
        miscutils.fwdebug_print("Creating job files - BEG")
        for jobkey, jobdict in sorted(joblist.items()):
            jobdict['jobnum'] = pfwutils.pad_jobnum(config.inc_jobnum())
            jobdict['jobkeys'] = jobkey
            jobdict['numexpwrap'] = len(jobdict['tasks'])
            if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                miscutils.fwdebug_print("jobnum = %s, jobkey = %s:" %
                                        (jobdict['jobnum'], jobkey))
            jobdict['tasksfile'] = write_workflow_taskfile(
                config, jobdict['jobnum'], jobdict['tasks'])
            if (len(jobdict['inlist']) > 0 and
                    config.getfull(pfwdefs.USE_HOME_ARCHIVE_OUTPUT) != 'never'
                    and 'submit_files_mvmt' in config and
                (pfwdefs.PF_DRYRUN not in config or not miscutils.convertBool(
                    config.getfull(pfwdefs.PF_DRYRUN)))):
                # get home archive info
                home_archive = config.getfull('home_archive')
                archive_info = config[pfwdefs.SW_ARCHIVESECT][home_archive]

                # load filemgmt class
                attempt_tid = config['task_id']['attempt']
                filemgmt = pfwutils.pfw_dynam_load_class(
                    dbh, config, attempt_tid, attempt_tid, "filemgmt",
                    archive_info['filemgmt'], archive_info)
                # save file information
                filemgmt.register_file_data('list', jobdict['inlist'],
                                            config['pfw_attempt_id'],
                                            attempt_tid, False, None, None)
                pfwblock.copy_input_lists_home_archive(config, filemgmt,
                                                       archive_info,
                                                       jobdict['inlist'])
                filemgmt.commit()
            jobdict['inputwcltar'] = pfwblock.tar_inputfiles(
                config, jobdict['jobnum'],
                jobdict['inwcl'] + jobdict['inlist'])
            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
                dbh.insert_job(config, jobdict)
            pfwblock.write_jobwcl(config, jobkey, jobdict)
            if ('glidein_use_wall' in config and miscutils.convertBool(
                    config.getfull('glidein_use_wall'))
                    and 'jobwalltime' in config):
                jobdict['wall'] = config['jobwalltime']

        miscutils.fwdebug_print("Creating job files - END")

        numjobs = len(joblist)
        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            dbh.update_block_numexpjobs(config, numjobs)

        dagfile = config.get_filename('jobdag')
        pfwblock.create_jobmngr_dag(config, dagfile, scriptfile, joblist)
    except BaseException:
        # Deliberately catches everything (the exception is re-raised below):
        # the config and the DB task status must be saved whatever aborted
        # the block preparation.
        retval = pfwdefs.PF_EXIT_FAILURE
        with open(configfile, 'w') as cfgfh:
            config.write(
                cfgfh)  # save config, have updated jobnum, wrapnum, etc
        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            dbh.end_task(config['task_id']['begblock'], retval, True)
            dbh.end_task(blktid, retval, True)
        raise

    # save config, have updated jobnum, wrapnum, etc
    with open(configfile, 'w') as cfgfh:
        config.write(cfgfh)

    (exists, dryrun) = config.search(pfwdefs.PF_DRYRUN)
    if exists and miscutils.convertBool(dryrun):
        retval = pfwdefs.PF_EXIT_DRYRUN
    else:
        retval = pfwdefs.PF_EXIT_SUCCESS
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh.end_task(config['task_id']['begblock'], retval, True)
    miscutils.fwdebug_print("END - exiting with code %s" % retval)

    return retval
Esempio n. 3
0
def blockpost(argv=None):
    """Program entry point: summarise a finished block and close it.

    Redirects stdout/stderr to ``blockpost.out`` (renamed into the block
    directory once the config has been read), gathers the status of the
    block's tasks, jobs and wrappers from the database when
    ``PF_USE_DB_OUT`` is true, optionally verifies archive file sizes,
    sends the status email according to ``when_to_email``, records the end
    of the block task (and of the attempt on failure) in the database and,
    on success, increments the block number in the saved config.

    Parameters:
        argv: ``[program, configfile, retval]``; ``None`` falls back to
            ``sys.argv``.

    Returns:
        The final exit status as an int.  ``pfwdefs.PF_EXIT_FAILURE`` is
        returned immediately when the argument count is wrong, and
        ``pfwdefs.PF_EXIT_DRYRUN`` is returned for dry runs.

    Note:
        ``sys.stdout`` and ``sys.stderr`` are left pointing at the log file
        handle, which is closed before returning.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    print("running on %s" % (socket.gethostname()))

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
    miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {
            'flabel': 'blockpost',
            'fsuffix': 'out'
        }})
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    reqnum = config.getfull(pfwdefs.REQNUM)
    unitname = config.getfull(pfwdefs.UNITNAME)
    attnum = config.getfull(pfwdefs.ATTNUM)
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    # Stays None unless PF_USE_QCF is enabled; it is read unconditionally
    # further down, so it must always be bound.
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrap_bymod = {}
    wrapinfo = {}
    jobinfo = {}
    failedwraps = {}
    whyfailwraps = {}  # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        sem = None
        try:
            miscutils.fwdebug_print("Connecting to DB")
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
            if verify_files:
                curs = dbh.cursor()
                # NOTE(review): SQL built by string interpolation from a
                # config value; consider a bind variable if the DB layer
                # supports one.
                curs.execute("select root from ops_archive where name='%s'" %
                             (config.getfull('home_archive')))
                rows = curs.fetchall()
                num_rows = 0 if rows is None else len(rows)
                if num_rows != 1:
                    raise Exception(
                        "Invalid archive name (%s).   Found %s rows in ops_archive"
                        % (config.getfull('home_archive'), num_rows))
                root = rows[0][0]
                if not os.path.isdir(root):
                    print(
                        "Cannot read archive root directory:%s This program must be run on an NCSA machine with access to the archive storage system."
                        % (root))
                sem = dbsem.DBSemaphore(
                    'verify_files_10', None,
                    config.getfull('submit_des_services'),
                    config.getfull('submit_des_db_section'))
                print(
                    "\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(
                    dbh=dbh,
                    archive=config.getfull('home_archive'),
                    pfwid=attid,
                    filesize=True,
                    md5sum=False,
                    quick=True,
                    debug=False,
                    script=False,
                    verbose=False,
                    silent=True)
                # Drop the reference to the semaphore but keep the name
                # bound so that the exception handler below can test it.
                sem = None
                print("  Verification of files returned status %i" %
                      (verify_status))
                if verify_status != 0:
                    print(
                        "  This indicates that one or more files do not have the correct file size (based on DB entries). Run"
                    )
                    print(
                        "\n    compare_db.py --des_services %s --section %s --archive %s --pfwid %i --filesize --verbose"
                        % (config.getfull('submit_des_services'),
                           config.getfull('submit_des_db_section'),
                           config.getfull('home_archive'), int(attid)))
                    print("\n  to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'),
                                   config.getfull('submit_des_db_section'))

            print("\n\nChecking non-job block task status from task table in DB (%s is success)" %
                  pfwdefs.PF_EXIT_SUCCESS)
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id']
                    and str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(
                    "Done getting block task info from DB (%s secs)" %
                    (end_time - start_time))
                for bltdict in list(bltasks.values()):
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += "\t%s" % (bltdict['name'])
                        if bltdict['label'] is not None:
                            msg2 += " - %s" % (bltdict['label'])
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print(
                                "Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = "select id from task where parent_task_id=%i and status!=0" % (
                                begblock_tid)
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print(
                                    "Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(
                                    wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(
                                    "Done querying QCF messages (%s secs)" %
                                    (end_time - start_time))
                                miscutils.fwdebug_print("wrapmsg = %s" %
                                                        wrapmsg)
                            if len(wrapmsg) == 0:
                                msg2 += "    No QCF messages\n"
                            else:
                                for msgs in list(wrapmsg.values()):
                                    for m in msgs:
                                        msg2 += "    " + m['message'] + "\n"

                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print("\n\nChecking job status from pfw_job table in DB (%s is success)" %
                          pfwdefs.PF_EXIT_SUCCESS)

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting job info from DB (%s secs)" %
                        (end_time - start_time))

                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid,
                                                    pfw_block_task_id=blktid)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting wrapper info from DB (%s secs)" %
                        (end_time - start_time))
            else:
                msg = "Could not find task id for block %s in config.des" % blockname
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    # NOTE(review): `pfw_utils` is spelled differently from
                    # the `pfwutils` module used elsewhere in this function -
                    # confirm that this name is actually importable.
                    Messaging.pfw_message(dbh, attid,
                                          config['task_id']['attempt'], msg,
                                          pfw_utils.PFW_DB_INFO,
                                          'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done saving pfw message (%s secs)" %
                        (end_time - start_time))
                print("all the task ids:", config['task_id'])

            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_fail_log_fullnames(attid, archive)
            dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print("Warn: could not find jobs for block %s" % blknum)
                print("      This is ok if attempt died before jobs ran")
                print("      block task_ids in job_byblk: %s" %
                      (list(job_byblk.keys()),))
            else:
                wrap_byjob, wrap_bymod = pfwutils.index_wrapper_info(wrapinfo)
                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict[
                            'status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']

                    submit_job_path = "%s/B%02d-%s/%04d" % (
                        config.getfull('work_dir'),
                        int(config.getfull('blknum')),
                        config.getfull('blockname'), int(jobdict['jobnum']))
                    msg2 += "\n\t%s (%s) " % (pfwutils.pad_jobnum(
                        jobdict['jobnum']), jobkeys)

                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        # The module of the highest-numbered wrapper is
                        # taken as the module the job stopped in.
                        maxwrap = max(wrap_byjob[jobtid].keys())
                        modname = wrap_byjob[jobtid][maxwrap]['modname']

                        msg2 += "%d/%s  %s" % (len(
                            wrap_byjob[jobtid]), jobdict['expect_num_wrap'],
                                               modname)

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in list(wrap_byjob[jobtid].items()):
                            if wdict['status'] is None or wdict[
                                    'status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                    if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += "\n\t\t%s/runjob.out " % (submit_job_path)

                    msg2 += '\n'

                    # print pfw_messages
                    if 'message' in jobdict:
                        print(jobdict['message'])
                        for msgdict in sorted(jobdict['message'],
                                              key=lambda k: k['message_time']):
                            level = int(msgdict['message_lvl'])
                            levelstr = 'info'
                            if level == pfwdefs.PFWDB_MSG_WARN:
                                levelstr = 'WARN'
                            elif level == pfwdefs.PFWDB_MSG_ERROR:
                                levelstr = 'ERROR'

                            msg2 += "\t\t%s - %s\n" % (
                                levelstr, msgdict['message'].replace(
                                    '\n', '\n\t\t\t'))

                    if jobtid in wrap_byjob:
                        # print log file name for failed/unfinished wrappers
                        for wrapnum in failedwraps[jobtid]:
                            wrapdict = wrap_byjob[jobtid][wrapnum]
                            if wrapdict['log'] in logfullnames:
                                msg2 += "\t\t%s - %s\n" % (
                                    wrapnum, logfullnames[wrapdict['log']])
                            else:
                                msg2 += "\t\t%s - Could not find log in archive (%s)\n" % (
                                    wrapnum, wrapdict['log'])
                            wrapmsg = get_qcf_messages(qdbh, config,
                                                       [wrapdict['task_id']])
                            msg2 = print_qcf_messages(config, wrapdict,
                                                      wrapmsg, msg2)

                        msg2 += '\n'

                        # If weirdness happened in run, print a message
                        if len(whyfailwraps[jobtid]) > 0:
                            msg2 += "\n*** Contact framework developers.   Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                            # Wrapper numbers may not be strings, so convert
                            # them before joining.
                            msg2 += "\t%s\n" % ','.join(
                                str(wnum) for wnum in whyfailwraps[jobtid])

        except Exception as exc:
            # Best effort: a failure while gathering status must not stop
            # the email from being sent or the block from being closed.
            sem = None
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print(
                "\n\nEncountered error trying to gather status information for email"
            )
            print("%s: %s" % (exc.__class__.__name__, str(exc)))
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE
    # A non-zero verification status turns the final status non-zero.
    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = "%s:  In dryrun mode, block %s has finished successfully." % (
                    run, blockname)
            else:
                msg1 = "%s:  In dryrun mode, block %s has failed." % (
                    run, blockname)

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = "%s:  block %s has failed." % (run, blockname)
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = "%s:  block %s has finished successfully." % (run,
                                                                 blockname)
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            # With 'run', a success email is only sent after the last block.
            numblocks = len(
                miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = "%s:  run has finished successfully." % (run)
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    dbh = None
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ",
              len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))

    miscutils.fwdebug_print("Returning retval = %s (%s)" %
                            (retval, type(retval)))
    miscutils.fwdebug_print("END")
    debugfh.close()
    return int(retval)
Esempio n. 4
0
def jobpre(argv=None):
    """Program entry point for the job pre-script.

    Redirects stdout/stderr to a scratch log, reads the workflow config,
    moves the log to its per-job name under the block directory, optionally
    records submit timestamps through the DB handle, and logs the 'pretask'
    event.

    Args:
        argv: command-line style list ``[prog, configfile, jobnum]``.
            Defaults to ``sys.argv`` when None.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` when fewer than three arguments are
        given, otherwise ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    # The final log name depends on the config, so start with a scratch file
    # in the current directory and rename it once the config has been read.
    default_log = f"jobpre_{random.randint(1,10000000):08d}.out"
    debugfh = open(default_log, 'w')

    tmpfn = debugfh.name
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv)) # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print("Usage: jobpre configfile jobnum")
        debugfh.close()
        # Do not leave the standard streams bound to a closed file.
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # Read from argv (not sys.argv) so an explicitly passed list is honoured.
    configfile = argv[1]
    jobnum = argv[2]    # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM:jobnum,
                                                                     'flabel': 'jobpre',
                                                                     'fsuffix':'out'}})
    new_log_name = f"{blkdir}/{tjpad}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    # Close and restore before moving the file; it is reopened in append
    # mode below under its final name.
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)

    use_db = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    dbh = None
    if use_db:
        # Reuse the handle carried by the config when there is one.
        if config.dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
        else:
            dbh = config.dbh

    if 'use_qcf' in config and config['use_qcf']:
        debugfh = Messaging.Messaging(new_log_name, 'jobpre.py', config['pfw_attempt_id'],
                                      dbh=dbh, mode='a+', usedb=dbh is not None)
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    if use_db:
        # The same timestamp is stored for both submit-time columns.
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {'condor_submit_time': ctstr,
                                            'target_submit_time': ctstr})

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 5
0
def jobpost(argv=None):
    """Performs steps needed after a pipeline job.

    Redirects stdout/stderr to a log file, reads the workflow config, and
    changes into the job directory. When DB output is enabled it updates
    job and task rows from the job output and from the condor user log
    ('runjob.log'). It then logs the 'posttask' event, removes the input
    tarball, and hands a non-empty output tarball to ``pfwutils.untar_dir``
    before deleting it.

    Args:
        argv: command-line style list
            ``[prog, configfile, block, jobnum, inputtar, outputtar, retval]``.
            Defaults to ``sys.argv`` when None.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` when fewer than seven arguments are
        given. Otherwise the job's ``retval`` as an int, with any
        non-success value mapped to ``pfwdefs.PF_EXIT_FAILURE``.
    """
    # condor log key -> DB column name
    condor2db = {'jobid': 'condor_job_id',
                 'csubmittime': 'condor_submit_time',
                 'gsubmittime': 'target_submit_time',
                 'starttime': 'condor_start_time',
                 'endtime': 'condor_end_time'}

    if argv is None:
        argv = sys.argv

    # Remember the real streams so they can be put back on return instead of
    # leaving sys.stdout/sys.stderr bound to a closed log file.
    outorig = sys.stdout
    errorig = sys.stderr

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpost_', dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    miscutils.fwdebug_print("temp log name = %s" % tmpfn)
    print('cmd>', ' '.join(argv))  # print command line for debugging

    if len(argv) < 7:
        # the temp log catches error messages about the command line
        print('Usage: jobpost.py configfile block jobnum inputtar outputtar retval')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    jobnum = argv[3]
    inputtar = argv[4]
    outputtar = argv[5]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 7:
        # Read from argv (not sys.argv) so an explicitly passed list is honoured.
        retval = int(argv[6])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
        miscutils.fwdebug_print("block = %s" % blockname)
        miscutils.fwdebug_print("jobnum = %s" % jobnum)
        miscutils.fwdebug_print("inputtar = %s" % inputtar)
        miscutils.fwdebug_print("outputtar = %s" % outputtar)
        miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    # The block name from the config replaces the one given on the command line.
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # Relative paths below (log name, runjob.log, tarballs) resolve against
    # the job directory from here on.
    os.chdir("%s/%s" % (blkdir, tjpad))
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                                                                     'flabel': 'jobpost',
                                                                     'fsuffix': 'out'}})
    new_log_name = "%s" % (new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))

        # get job information from the job stdout if exists
        (tjobinfo, tjobinfo_task) = parse_job_output(config, jobnum, dbh, retval)

        if dbh and len(tjobinfo) > 0:
            print("tjobinfo: ", tjobinfo)
            dbh.update_tjob_info(config['task_id']['job'][jobnum], tjobinfo)

        # get job information from the condor job log
        logfilename = 'runjob.log'
        if os.path.exists(logfilename) and os.path.getsize(logfilename) > 0:  # if made it to submitting/running jobs
            # Best effort: any failure while reading the condor log is printed
            # to the job log and does not abort the post-processing.
            try:
                # update job info in DB from condor log
                print("Updating job info in DB from condor log")
                condorjobinfo = pfwcondor.parse_condor_user_log(logfilename)
                if len(condorjobinfo) > 1:
                    print("More than single job in job log")
                # Only the first job found in the log is used.
                j = list(condorjobinfo)[0]
                cjobinfo = condorjobinfo[j]
                djobinfo = {}
                for ckey, dkey in condor2db.items():
                    if ckey in cjobinfo:
                        djobinfo[dkey] = cjobinfo[ckey]
                print(djobinfo)
                dbh.update_job_info(config, cjobinfo['jobname'], djobinfo)

                if 'holdreason' in cjobinfo and cjobinfo['holdreason'] is not None:
                    msg = "Condor HoldReason: %s" % cjobinfo['holdreason']
                    print(msg)
                    if dbh:
                        Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                              config['task_id']['job'][jobnum],
                                              msg, pfwdefs.PFWDB_MSG_WARN)

                if 'abortreason' in cjobinfo and cjobinfo['abortreason'] is not None:
                    tjobinfo_task['start_time'] = cjobinfo['starttime']
                    tjobinfo_task['end_time'] = cjobinfo['endtime']
                    # An abort reason mentioning condor_rm is recorded as an
                    # operator delete; any other abort as a condor failure.
                    if 'condor_rm' in cjobinfo['abortreason']:
                        tjobinfo_task['status'] = pfwdefs.PF_EXIT_OPDELETE
                    else:
                        tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
            except Exception:
                (extype, exvalue, trback) = sys.exc_info()
                traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
        else:
            print("Warning:  no job condor log file")

        if dbh:
            # update job task, filling in defaults for anything not learned above
            if 'status' not in tjobinfo_task:
                tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
            if 'end_time' not in tjobinfo_task:
                tjobinfo_task['end_time'] = datetime.now()
            wherevals = {'id': config['task_id']['job'][jobnum]}
            dbh.basic_update_row('task', tjobinfo_task, wherevals)
            dbh.commit()

    log_pfw_event(config, blockname, jobnum, 'j', ['posttask', retval])

    # input wcl should already exist in untar form
    if os.path.exists(inputtar):
        print("found inputtar: %s" % inputtar)
        os.unlink(inputtar)
    else:
        print("Could not find inputtar: %s" % inputtar)

    # untar output wcl tar and delete tar
    if os.path.exists(outputtar):
        print("Size of output wcl tar:", os.path.getsize(outputtar))
        if os.path.getsize(outputtar) > 0:
            print("found outputtar: %s" % outputtar)
            pfwutils.untar_dir(outputtar, '..')
            os.unlink(outputtar)
        else:
            msg = "Warn: outputwcl tarball (%s) is 0 bytes." % outputtar
            print(msg)
            if dbh:
                Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                      config['task_id']['job'][jobnum],
                                      msg, pfwdefs.PFWDB_MSG_WARN)
    else:
        msg = "Warn: outputwcl tarball (%s) does not exist." % outputtar
        print(msg)
        if dbh:
            Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                  config['task_id']['job'][jobnum],
                                  msg, pfwdefs.PFWDB_MSG_WARN)

    # Collapse every non-success code into the generic failure code.
    if retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print("Returning retval = %s" % retval)
    miscutils.fwdebug_print("jobpost done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return int(retval)
Esempio n. 6
0
def print_single_wrap(wrapnum,
                      numwraps,
                      expnumwrap,
                      jdict,
                      jwdict,
                      wdict,
                      indent='\t'):
    """Print a one-line status summary for a job and one of its wrappers.

    The job state (PRE/EXEC/POST/DONE/FAIL) is derived from ``jdict``; the
    wrapper state (PRE/EXEC/POST/DONE/FAIL/UNK) from ``jwdict`` and
    ``wdict``. Any of the three records may be None; fields that cannot be
    determined are printed as "UNK".

    Args:
        wrapnum: wrapper number shown in the output line.
        numwraps: number of wrappers so far (formatted with ``%d``).
        expnumwrap: expected number of wrappers (formatted with ``%d``).
        jdict: job record; the keys read are 'start_time', 'end_time',
            'status', 'jobnum' and 'jobkeys'. May be None.
        jwdict: job_wrapper record; the keys read are 'end_time' and
            'status'. May be None.
        wdict: wrapper record; the keys read are 'start_time', 'end_time',
            'status', 'modname' and 'wrapkeys'. May be None.
        indent: prefix written before the summary line.
    """
    # Defaults cover every branch below, including the fall-through one, so
    # the final print never sees an unbound name.
    state = "UNK"
    modname = "UNK"
    wrapkeys = ""
    status = "UNK"

    # ----- job state -----
    if jdict is None or jdict['start_time'] is None:
        jstate = "PRE"
        jstatus = None
    else:
        jstatus = jdict['status']
        if jdict['end_time'] is None:
            # All expected wrappers exist and the last job_wrapper has ended,
            # but the job itself has not: it is in its post-wrapper phase.
            if (numwraps == expnumwrap and jwdict is not None
                    and jwdict['end_time'] is not None):
                jstate = "POST"
            else:
                jstate = "EXEC"
        elif jstatus == 0:
            jstate = "DONE"
        else:
            jstate = "FAIL"

    # ----- wrapper state -----
    if jwdict is None:
        if jdict is None or jdict['end_time'] is None:
            status = "UNK - maybe first wrapper hasn't started yet"
    elif jwdict['end_time'] is not None:
        status = jwdict['status']
        state = "DONE" if status == 0 else "FAIL"
        if wdict is not None:
            modname = wdict['modname']
            wrapkeys = wdict['wrapkeys']
    elif wdict is None:
        state = "PRE"
        if jwdict['status'] is None:
            # Fall back to the job's status when the job_wrapper has none.
            status = jdict['status'] if jdict is not None else None
        else:
            status = jwdict['status']
    elif wdict['end_time'] is not None and jwdict['end_time'] is None:
        state = "POST"  # after wrapper, but still in job_wrapper
        status = wdict['status']
        modname = wdict['modname']
        wrapkeys = wdict['wrapkeys']
    elif wdict['end_time'] is None and wdict['start_time'] is not None:
        state = "EXEC"
        status = ""
        modname = wdict['modname']
        wrapkeys = wdict['wrapkeys']
    else:
        print("Didn't fit conditions:")
        print(jwdict)
        print(wdict)

    if jdict is None:
        joblabel = "UNK"
        jobkeys = "UNK"
    else:
        joblabel = pfwutils.pad_jobnum(jdict['jobnum'])
        jobkeys = jdict['jobkeys']

    print("%sjob: %s (jk=%s)  %d/%d  %s - %s   wrap: %s %s (wk=%s) - %s %s" %
          (indent, joblabel, jobkeys,
           numwraps, expnumwrap, jstate, jstatus,
           wrapnum, modname, wrapkeys,
           state, status))
Esempio n. 7
0
def jobpre(argv=None):
    """Program entry point for the job pre-script.

    Redirects stdout/stderr to a temporary log, reads the workflow config,
    moves the log to its per-job name under the block directory, optionally
    records submit timestamps in the DB, and logs the 'pretask' event.

    Args:
        argv: command-line style list ``[prog, configfile, jobnum]``.
            Defaults to ``sys.argv`` when None.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` when fewer than three arguments are
        given, otherwise ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    # Remember the real streams so they can be put back on return instead of
    # leaving sys.stdout/sys.stderr bound to a closed log file.
    outorig = sys.stdout
    errorig = sys.stderr

    # The final log name depends on the config, so start with a temp file in
    # the current directory and rename it once the config has been read.
    debugfh = tempfile.NamedTemporaryFile(mode='w+',
                                          prefix='jobpre_',
                                          dir='.',
                                          delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print('Usage: jobpre configfile jobnum')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # Read from argv (not sys.argv) so an explicitly passed list is honoured.
    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'job', {
            pfwdefs.PF_CURRVALS: {
                pfwdefs.PF_JOBNUM: jobnum,
                'flabel': 'jobpre',
                'fsuffix': 'out'
            }
        })
    new_log_name = "%s/%s/%s" % (blkdir, tjpad, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    # Close before moving the file, then reopen it in append mode under its
    # final name.
    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        # The same timestamp is stored for both submit-time columns.
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {
            'condor_submit_time': ctstr,
            'target_submit_time': ctstr
        })

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS