Esempio n. 1
0
def main(argv=None):
    """Run the query step for every module listed in the workflow config.

    argv defaults to sys.argv and is expected to hold exactly three
    entries: program name, config (wcl) file, and condor job id.  A wrong
    argument count, a config without a module list, or a module that has no
    description section is reported through miscutils.fwdie with
    pfwdefs.PF_EXIT_FAILURE.

    Returns 0 after runqueries() has been called for each listed module.
    """
    args = sys.argv if argv is None else argv

    if len(args) != 3:
        miscutils.fwdie("Usage: runqueries.pl configfile condorjobid\n", pfwdefs.PF_EXIT_FAILURE)

    configfile, condorid = args[1], args[2]
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # record the condor job id against the current block
    log_pfw_event(config, config['curr_block'], 'runqueries', 'j', ['cid', condorid])

    if pfwdefs.SW_MODULELIST not in config:
        miscutils.fwdie("Error:  No modules to run.", pfwdefs.PF_EXIT_FAILURE)

    ### Get master lists and files calling external codes when needed

    # module names are compared lower-cased
    modnames = miscutils.fwsplit(config[pfwdefs.SW_MODULELIST].lower())

    # modules already handled, passed to runqueries for each later module
    seen_modules = {}
    for modname in modnames:
        has_description = modname in config[pfwdefs.SW_MODULESECT]
        if not has_description:
            miscutils.fwdie("Error: Could not find module description for module %s\n" %
                            (modname), pfwdefs.PF_EXIT_FAILURE)
        runqueries(config, configfile, modname, seen_modules)
        seen_modules[modname] = True

    return 0
Esempio n. 2
0
def submit_main_dag(config, dagfile, logfh):
    """ Submit main DAG file to Condor

    Submits '<dagfile>.condor.sub' through pfwcondor.condor_submit.  When
    the submit exit code is non-zero or the submit output contains 'ERROR',
    the output is written to stderr and to logfh; otherwise a short
    success summary is printed.  Either way the dagman user log is then
    parsed and the pipeline manager events are logged.

    config  -- workflow config (supports getfull() and item access)
    dagfile -- base name of the DAG file (without '.condor.sub')
    logfh   -- writable file handle that receives submit failures

    Returns the condor id as an int when the dagman log holds exactly one
    job, otherwise None.
    """
    (exitcode, outtuple) = pfwcondor.condor_submit('%s.condor.sub' % (dagfile))
    if exitcode or re.search('ERROR', outtuple[0]):
        sys.stderr.write('\n%s\n' % (outtuple[0]))

        logfh.write('\ncondor_submit %s.condor.sub\n%s\n' %
                    (dagfile, outtuple[0]))
    else:
        # Single-argument print(...) produces the same output as the print
        # statement on Python 2 and is a plain function call on Python 3.
        print('\nImage processing successfully submitted to condor:')
        print('\tRun = %s' % (config.getfull('submit_run')))
        print("\tpfw_attempt_id = %s" % (config['pfw_attempt_id']))
        print("\tpfw_attempt task_id = %s" % (config['task_id']['attempt']))
    print('\n')

    # for completeness, log condorid of pipeline manager
    dagjob = pfwcondor.parse_condor_user_log('%s/%s.dagman.log' %
                                             (config.getfull('uberctrl_dir'), dagfile))
    # list() keeps the keys indexable on Python 3, where keys() is a view
    jobids = list(dagjob.keys())
    condorid = None
    if len(jobids) == 1:
        condorid = int(jobids[0])
    pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', 'pretask')
    pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', {'cid': condorid})

    return condorid
Esempio n. 3
0
def submit_main_dag(config, dagfile, logfh):
    """Hand the main DAG's submit file to Condor and report the outcome.

    A non-zero submit exit code, or 'ERROR' anywhere in the submit output,
    is treated as failure: the output goes to stderr and (flushed) to
    logfh.  Otherwise a short success summary is printed.  In both cases
    the dagman user log is parsed afterwards and two pipeline manager
    events are logged.

    Returns the condor id (int) if the dagman log contains exactly one job,
    else None.
    """
    status, output = pfwcondor.condor_submit(f"{dagfile}.condor.sub")
    submit_text = output[0]
    failed = bool(status) or re.search('ERROR', submit_text) is not None

    if not failed:
        print('\nImage processing successfully submitted to condor:')
        print(f"\tRun = {config.getfull('submit_run')}")
        print(f"\tpfw_attempt_id = {config['pfw_attempt_id']}")
        print(f"\tpfw_attempt task_id = {config['task_id']['attempt']}")
    else:
        sys.stderr.write(f"\n{submit_text}\n")

        logfh.write(f"\ncondor_submit {dagfile}.condor.sub\n{submit_text}\n")
        logfh.flush()
    print('\n')

    # for completeness, log condorid of pipeline manager
    dagman_log = f"{config.getfull('uberctrl_dir')}/{dagfile}.dagman.log"
    found_ids = list(pfwcondor.parse_condor_user_log(dagman_log).keys())
    condorid = int(found_ids[0]) if len(found_ids) == 1 else None

    for payload in ('pretask', {'cid': condorid}):
        pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', payload)

    return condorid
Esempio n. 4
0
def logpre(argv=None):
    """Program entry point.

    Logs the 'pretask' event for a sub-block.  stdout and stderr are
    redirected to 'logpre.out', which is renamed into the block directory
    once the config has been read, and all further output is appended there.

    argv -- [program, configfile, block, subblocktype, subblock]; defaults
            to sys.argv.  The positional values are read from this list,
            so an explicitly passed argv is honoured.

    Returns pfwdefs.PF_EXIT_FAILURE when fewer than five arguments are
    given, otherwise pfwdefs.PF_EXIT_SUCCESS.

    NOTE(review): sys.stdout/sys.stderr are left pointing at the closed log
    file on return -- fine for a one-shot script, confirm for library use.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'logpre.out'
    debugfh = open(default_log, 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging

    if len(argv) < 5:
        print('Usage: logpre configfile block subblocktype subblock')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    # argv[2] (block, could also be uberctrl) is part of the command line,
    # but the block name actually used below is read from the config.
    subblocktype = argv[3]
    subblock = argv[4]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename(
        'block', {
            pfwdefs.PF_CURRVALS: {
                'subblock': subblock,
                'flabel': '${subblock}_logpre',
                'fsuffix': 'out'
            }
        })
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)
    debugfh.close()

    # move the temporary log to its final name, then keep appending to it
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)

    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype, ['pretask'])

    print("logpre done")
    debugfh.close()
    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 5
0
def blockpost(argv=None):
    """Program entry point.

    Post-processing step for a block.  In order, it:

    * redirects stdout/stderr to 'blockpost.out' and, once the config is
      read, renames that file into the block directory and appends to it;
    * changes the working directory to the block directory and logs the
      'posttask' event;
    * when DB output is enabled, optionally verifies archive file sizes,
      then collects block-task, job and wrapper status from the DB and
      builds the failure details (msg2) for the email;
    * sends an email depending on dryrun, the resulting status and the
      'when_to_email' setting;
    * when DB output is enabled, records the end of the block task (and of
      the attempt on failure);
    * on success, increments the block number and rewrites the config file.

    argv -- [program, configfile, retval]; defaults to sys.argv.

    Returns pfwdefs.PF_EXIT_FAILURE on a wrong argument count, otherwise
    the final status as an int.  Any exception raised while gathering
    status is printed to the log and turns the status into
    pfwdefs.PF_EXIT_FAILURE.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    print("running on %s" % (socket.gethostname()))

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
    miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {
            'flabel': 'blockpost',
            'fsuffix': 'out'
        }})
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    reqnum = config.getfull(pfwdefs.REQNUM)
    unitname = config.getfull(pfwdefs.UNITNAME)
    attnum = config.getfull(pfwdefs.ATTNUM)
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    # QCF handle stays None unless QCF is enabled; the code below and the
    # QCF helpers are handed this value, so it must always be bound.
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrap_bymod = {}
    wrapinfo = {}
    jobinfo = {}
    failedwraps = {}
    whyfailwraps = {}  # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if usedb:
        sem = None
        try:
            miscutils.fwdebug_print("Connecting to DB")
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
            if verify_files:
                curs = dbh.cursor()
                curs.execute("select root from ops_archive where name='%s'" %
                             (config.getfull('home_archive')))
                rows = curs.fetchall()
                # count defensively so the error message itself cannot fail
                # when no result list is returned
                nrows = 0 if rows is None else len(rows)
                if nrows != 1:
                    raise Exception(
                        "Invalid archive name (%s).   Found %s rows in ops_archive"
                        % (config.getfull('home_archive'), nrows))
                root = rows[0][0]
                if not os.path.isdir(root):
                    # NOTE(review): the message is filled with the archive
                    # name, not the root path that was checked -- confirm.
                    print(
                        "Cannot read archive root directory:%s This program must be run on an NCSA machine with access to the archive storage system."
                        % (config.getfull('home_archive')))
                sem = dbsem.DBSemaphore(
                    'verify_files_10', None,
                    config.getfull('submit_des_services'),
                    config.getfull('submit_des_db_section'))
                print(
                    "\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(
                    dbh=dbh,
                    archive=config.getfull('home_archive'),
                    pfwid=attid,
                    filesize=True,
                    md5sum=False,
                    quick=True,
                    debug=False,
                    script=False,
                    verbose=False,
                    silent=True)
                # Drop the semaphore reference by rebinding rather than
                # `del`, so the name stays defined for the except handler.
                sem = None
                print("  Verification of files returned status %i" %
                      (verify_status))
                if verify_status != 0:
                    print(
                        "  This indicates that one or more files do not have the correct file size (based on DB entries). Run"
                    )
                    print(
                        "\n    compare_db.py --des_services %s --section %s --archive %s --pfwid %i --filesize --verbose"
                        % (config.getfull('submit_des_services'),
                           config.getfull('submit_des_db_section'),
                           config.getfull('home_archive'), int(attid)))
                    print("\n  to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'),
                                   config.getfull('submit_des_db_section'))

            print("\n\nChecking non-job block task status from task table in DB (%s is success)" % \
                  pfwdefs.PF_EXIT_SUCCESS)
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id']
                    and str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(
                    "Done getting block task info from DB (%s secs)" %
                    (end_time - start_time))
                for bltdict in list(bltasks.values()):
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += "\t%s" % (bltdict['name'])
                        if bltdict['label'] is not None:
                            msg2 += " - %s" % (bltdict['label'])
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print(
                                "Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = "select id from task where parent_task_id=%i and status!=0" % (
                                begblock_tid)
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print(
                                    "Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(
                                    wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(
                                    "Done querying QCF messages (%s secs)" %
                                    (end_time - start_time))
                                miscutils.fwdebug_print("wrapmsg = %s" %
                                                        wrapmsg)
                            if len(wrapmsg) == 0:
                                msg2 += "    No QCF messages\n"
                            else:
                                for msgs in list(wrapmsg.values()):
                                    for m in msgs:
                                        msg2 += "    " + m['message'] + "\n"

                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print("\n\nChecking job status from pfw_job table in DB (%s is success)" % \
                        pfwdefs.PF_EXIT_SUCCESS)

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting job info from DB (%s secs)" %
                        (end_time - start_time))

                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid,
                                                    pfw_block_task_id=blktid)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting wrapper info from DB (%s secs)" %
                        (end_time - start_time))
            else:
                msg = "Could not find task id for block %s in config.des" % blockname
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    Messaging.pfw_message(dbh, attid,
                                          config['task_id']['attempt'], msg,
                                          pfw_utils.PFW_DB_INFO,
                                          'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done saving pfw message (%s secs)" %
                        (end_time - start_time))
                print("all the task ids:", config['task_id'])

            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_fail_log_fullnames(attid, archive)
            dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print("Warn: could not find jobs for block %s" % blknum)
                print("      This is ok if attempt died before jobs ran")
                # the format string needs a placeholder for the key list
                print("      block task_ids in job_byblk: %s" %
                      list(job_byblk.keys()))
            else:
                wrap_byjob, wrap_bymod = pfwutils.index_wrapper_info(wrapinfo)
                #print "wrap_byjob:", wrap_byjob
                #print "wrap_bymod:", wrap_bymod
                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict[
                            'status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']
                        #print "jobkeys = ", jobkeys, type(jobkeys)

                    submit_job_path = "%s/B%02d-%s/%04d" % (
                        config.getfull('work_dir'),
                        int(config.getfull('blknum')),
                        config.getfull('blockname'), int(jobdict['jobnum']))
                    msg2 += "\n\t%s (%s) " % (pfwutils.pad_jobnum(
                        jobdict['jobnum']), jobkeys)

                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        #print "wrapnum in job =", wrap_byjob[jobtid].keys()
                        maxwrap = max(wrap_byjob[jobtid].keys())
                        #print "maxwrap =", maxwrap
                        modname = wrap_byjob[jobtid][maxwrap]['modname']
                        #print "modname =", modname

                        msg2 += "%d/%s  %s" % (len(
                            wrap_byjob[jobtid]), jobdict['expect_num_wrap'],
                                               modname)

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in list(wrap_byjob[jobtid].items()):
                            if wdict['status'] is None or wdict[
                                    'status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                    if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += "\n\t\t%s/runjob.out " % (submit_job_path)

                    msg2 += '\n'

                    # print pfw_messages
                    if 'message' in jobdict:
                        print(jobdict['message'])
                        for msgdict in sorted(jobdict['message'],
                                              key=lambda k: k['message_time']):
                            level = int(msgdict['message_lvl'])
                            levelstr = 'info'
                            if level == pfwdefs.PFWDB_MSG_WARN:
                                levelstr = 'WARN'
                            elif level == pfwdefs.PFWDB_MSG_ERROR:
                                levelstr = 'ERROR'

                            msg2 += "\t\t%s - %s\n" % (
                                levelstr, msgdict['message'].replace(
                                    '\n', '\n\t\t\t'))

                    if jobtid in wrap_byjob:
                        # print log file name for failed/unfinished wrappers
                        for wrapnum in failedwraps[jobtid]:
                            wrapdict = wrap_byjob[jobtid][wrapnum]
                            if wrapdict['log'] in logfullnames:
                                msg2 += "\t\t%s - %s\n" % (
                                    wrapnum, logfullnames[wrapdict['log']])
                            else:
                                msg2 += "\t\t%s - Could not find log in archive (%s)\n" % (
                                    wrapnum, wrapdict['log'])
                            wrapmsg = get_qcf_messages(qdbh, config,
                                                       [wrapdict['task_id']])
                            msg2 = print_qcf_messages(config, wrapdict,
                                                      wrapmsg, msg2)

                        msg2 += '\n'

                        # If weirdness happened in run, print a message
                        if len(whyfailwraps[jobtid]) > 0:
                            msg2 += "\n*** Contact framework developers.   Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                            # wrapper numbers may not be strings, so
                            # convert before joining
                            msg2 += "\t%s\n" % ','.join(
                                str(wnum) for wnum in whyfailwraps[jobtid])

        except Exception as exc:
            # release the semaphore reference if verification was interrupted
            sem = None
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print(
                "\n\nEncountered error trying to gather status information for email"
            )
            print("%s: %s" % (exc.__class__.__name__, str(exc)))
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE
    # a non-zero verification status also makes the block status non-zero
    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = "%s:  In dryrun mode, block %s has finished successfully." % (
                    run, blockname)
            else:
                msg1 = "%s:  In dryrun mode, block %s has failed." % (
                    run, blockname)

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = "%s:  block %s has failed." % (run, blockname)
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = "%s:  block %s has finished successfully." % (run,
                                                                 blockname)
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            # a per-run email is only sent after the last block
            numblocks = len(
                miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = "%s:  run has finished successfully." % (run)
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    dbh = None
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ",
              len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))

    miscutils.fwdebug_print("Returning retval = %s (%s)" %
                            (retval, type(retval)))
    miscutils.fwdebug_print("END")
    debugfh.close()
    return int(retval)
Esempio n. 6
0
def summary(argv=None):
    """ Create and send summary email

    Redirects stdout/stderr to 'summary.out', logs the manager 'posttask'
    event, sends the summary email and, when DB output is enabled, stores
    the end status of the attempt.

    argv -- [program, configfile, status]; defaults to sys.argv.  When the
            list does not have exactly three entries the status is treated
            as missing (None).

    Returns pfwdefs.PF_EXIT_FAILURE when no config file is given; otherwise
    the status: None when missing, pfwdefs.PF_EXIT_FAILURE when the given
    status is '1', else the status string as passed in.
    """
    if argv is None:
        argv = sys.argv

    debugfh = open('summary.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))

    if len(argv) < 2:
        print("Usage: summary configfile status")
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    if len(argv) == 3:
        status = argv[2]
        # dagman always exits with 0 or 1
        # Command-line values are strings, so the check has to be made
        # against '1'; comparing with the integer 1 could never match.
        if status == '1':
            status = pfwdefs.PF_EXIT_FAILURE
    else:
        print("summary: Missing status value")
        status = None

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': argv[1]})

    log_pfw_event(config, 'process', 'mngr', 'j', ['posttask', status])

    msgstr = ""

    msg1 = ""
    subject = ""
    # NOTE(review): the success/abort text below is only reached in DRYRUN
    # mode, where it always replaces the DRYRUN text; a non-dryrun run with
    # a status is emailed with an empty msg1.  Confirm the intended nesting.
    if not status:
        msg1 = f"Processing finished with unknown results.\n{msgstr}"
    elif pfwdefs.PF_DRYRUN in config and miscutils.convertBool(
            config.getfull(pfwdefs.PF_DRYRUN)):
        msg1 = f"Processing ended after DRYRUN\n{msgstr}"

        if int(status) == pfwdefs.PF_EXIT_SUCCESS:
            msg1 = "Processing has successfully completed.\n"
            subject = ""
        else:
            print(f"status = '{status}'")
            print("type(status) =", type(status))
            print(f"SUCCESS = '{pfwdefs.PF_EXIT_SUCCESS}'")
            print("type(SUCCESS) =", type(pfwdefs.PF_EXIT_SUCCESS))
            msg1 = f"Processing aborted with status {status}.\n"

    subject = ""
    pfwemail.send_email(config, "processing", status, subject, msg1, '')

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        dbh.update_attempt_end_vals(config['pfw_attempt_id'], status)
    print(f"summary: status = '{status}'")
    print("summary:", msg1)
    print("summary: End")
    debugfh.close()
    return status
Esempio n. 7
0
def jobpre(argv=None):
    """ Program entry point

    Pre-job step: redirects stdout/stderr to a temporary log, renames it
    to the job's jobpre log once the config is read, records the submit
    timestamps in the DB when DB output is enabled, and logs the 'pretask'
    event.  The original stdout/stderr are restored before returning, on
    both the usage-error and the normal path.

    argv -- [program, configfile, jobnum]; defaults to sys.argv.  The
            positional values are read from this list, so an explicitly
            passed argv is honoured.

    Returns pfwdefs.PF_EXIT_FAILURE when fewer than three arguments are
    given, otherwise pfwdefs.PF_EXIT_SUCCESS.
    """
    if argv is None:
        argv = sys.argv

    #debugfh = tempfile.NamedTemporaryFile(prefix='jobpre_', dir='.', delete=False)
    # random suffix in the temporary log name, used until the real name is known
    default_log = f"jobpre_{random.randint(1,10000000):08d}.out"
    debugfh = open(default_log, 'w')

    tmpfn = debugfh.name
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv)) # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print("Usage: jobpre configfile jobnum")
        debugfh.close()
        # do not leave the process streams pointing at the closed log
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    jobnum = argv[2]    # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    # NOTE(review): other entry points read block_dir with getfull() -- confirm get() is intended here
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM:jobnum,
                                                                     'flabel': 'jobpre',
                                                                     'fsuffix':'out'}})
    new_log_name = f"{blkdir}/{tjpad}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)

    # reuse the config's DB handle when it has one, otherwise open a new one
    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        if config.dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
        else:
            dbh = config.dbh

    if 'use_qcf' in config and config['use_qcf']:
        debugfh = Messaging.Messaging(new_log_name, 'jobpre.py', config['pfw_attempt_id'], dbh=dbh, mode='a+', usedb=dbh is not None)
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        # the same timestamp is stored for both submit times
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {'condor_submit_time': ctstr,
                                            'target_submit_time': ctstr})

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 8
0
def jobpost(argv=None):
    """Perform the steps needed after a pipeline job has finished.

    While running, stdout and stderr are redirected to a log file: first a
    temporary file in the current directory, then (once the config has been
    read) the job's ``jobpost`` log inside ``<block_dir>/<padded jobnum>``.
    If DB output is enabled, job information parsed from the job output and
    from the condor log ``runjob.log`` is saved to the database.  Finally the
    input tarball is removed and the output wcl tarball is untarred and
    removed.

    Parameters
    ----------
    argv : list of str, optional
        ``[prog, configfile, block, jobnum, inputtar, outputtar, retval]``.
        Defaults to ``sys.argv``.  ``retval`` is only used when exactly seven
        items are given; otherwise ``pfwdefs.PF_EXIT_FAILURE`` is assumed.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_SUCCESS`` if the given retval was success,
        otherwise ``pfwdefs.PF_EXIT_FAILURE`` (also used for usage errors).
    """
    # key in the parsed condor log -> key used when updating the job in the DB
    condor2db = {'jobid': 'condor_job_id',
                 'csubmittime': 'condor_submit_time',
                 'gsubmittime': 'target_submit_time',
                 'starttime': 'condor_start_time',
                 'endtime': 'condor_end_time'}

    if argv is None:
        argv = sys.argv

    # Remember the real streams so they can be put back on every normal
    # return.  They are deliberately NOT restored when an exception escapes,
    # so that the traceback still ends up in the log file.
    outorig = sys.stdout
    errorig = sys.stderr

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpost_', dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    miscutils.fwdebug_print("temp log name = %s" % tmpfn)
    print('cmd>', ' '.join(argv))  # print command line for debugging

    if len(argv) < 7:
        print('Usage: jobpost.py configfile block jobnum inputtar outputtar retval')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    jobnum = argv[3]
    inputtar = argv[4]
    outputtar = argv[5]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 7:
        # read from the argv actually passed in, not from sys.argv
        retval = int(argv[6])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
        miscutils.fwdebug_print("block = %s" % blockname)
        miscutils.fwdebug_print("jobnum = %s" % jobnum)
        miscutils.fwdebug_print("inputtar = %s" % inputtar)
        miscutils.fwdebug_print("outputtar = %s" % outputtar)
        miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    # the block name from the config replaces the one given on the command line
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # everything below uses paths relative to the job directory
    os.chdir("%s/%s" % (blkdir, tjpad))
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                                                                     'flabel': 'jobpost',
                                                                     'fsuffix': 'out'}})
    new_log_name = "%s" % (new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    # move the temporary log to its final name and keep appending to it
    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))

        # get job information from the job stdout if exists
        (tjobinfo, tjobinfo_task) = parse_job_output(config, jobnum, dbh, retval)

        if dbh and len(tjobinfo) > 0:
            print("tjobinfo: ", tjobinfo)
            dbh.update_tjob_info(config['task_id']['job'][jobnum], tjobinfo)

        # get job information from the condor job log
        logfilename = 'runjob.log'
        if os.path.exists(logfilename) and os.path.getsize(logfilename) > 0:  # if made it to submitting/running jobs
            # best effort: any problem with the condor log is reported in the
            # jobpost log but must not prevent the task row update below
            try:
                # update job info in DB from condor log
                print("Updating job info in DB from condor log")
                condorjobinfo = pfwcondor.parse_condor_user_log(logfilename)
                if len(condorjobinfo) > 1:
                    print("More than single job in job log")
                j = list(condorjobinfo)[0]
                cjobinfo = condorjobinfo[j]
                djobinfo = {dkey: cjobinfo[ckey]
                            for ckey, dkey in condor2db.items()
                            if ckey in cjobinfo}
                print(djobinfo)
                dbh.update_job_info(config, cjobinfo['jobname'], djobinfo)

                if 'holdreason' in cjobinfo and cjobinfo['holdreason'] is not None:
                    msg = "Condor HoldReason: %s" % cjobinfo['holdreason']
                    print(msg)
                    if dbh:
                        Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                              config['task_id']['job'][jobnum],
                                              msg, pfwdefs.PFWDB_MSG_WARN)

                if 'abortreason' in cjobinfo and cjobinfo['abortreason'] is not None:
                    tjobinfo_task['start_time'] = cjobinfo['starttime']
                    tjobinfo_task['end_time'] = cjobinfo['endtime']
                    # an abort reason mentioning condor_rm is recorded as an
                    # operator delete, anything else as a condor failure
                    if 'condor_rm' in cjobinfo['abortreason']:
                        tjobinfo_task['status'] = pfwdefs.PF_EXIT_OPDELETE
                    else:
                        tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
            except Exception:
                traceback.print_exc(file=sys.stdout)
        else:
            print("Warning:  no job condor log file")

        if dbh:
            # update job task
            if 'status' not in tjobinfo_task:
                tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
            if 'end_time' not in tjobinfo_task:
                tjobinfo_task['end_time'] = datetime.now()
            wherevals = {'id': config['task_id']['job'][jobnum]}
            dbh.basic_update_row('task', tjobinfo_task, wherevals)
            dbh.commit()

    log_pfw_event(config, blockname, jobnum, 'j', ['posttask', retval])

    # input wcl should already exist in untar form
    if os.path.exists(inputtar):
        print("found inputtar: %s" % inputtar)
        os.unlink(inputtar)
    else:
        print("Could not find inputtar: %s" % inputtar)

    # untar output wcl tar and delete tar
    if os.path.exists(outputtar):
        print("Size of output wcl tar:", os.path.getsize(outputtar))
        if os.path.getsize(outputtar) > 0:
            print("found outputtar: %s" % outputtar)
            pfwutils.untar_dir(outputtar, '..')
            os.unlink(outputtar)
        else:
            msg = "Warn: outputwcl tarball (%s) is 0 bytes." % outputtar
            print(msg)
            if dbh:
                Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                      config['task_id']['job'][jobnum],
                                      msg, pfwdefs.PFWDB_MSG_WARN)
    else:
        msg = "Warn: outputwcl tarball (%s) does not exist." % outputtar
        print(msg)
        if dbh:
            Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                  config['task_id']['job'][jobnum],
                                  msg, pfwdefs.PFWDB_MSG_WARN)

    # collapse every non-success value into the generic failure code
    if retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print("Returning retval = %s" % retval)
    miscutils.fwdebug_print("jobpost done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return int(retval)
Esempio n. 9
0
def logpre(argv=None):
    """Log the start ("pretask") of a sub-block.

    While running, stdout and stderr are redirected to a log file: first
    ``logpre.out`` in the current directory, then (once the config has been
    read) the block-level ``<subblock>_logpre`` log inside ``block_dir``.
    The original streams are restored before returning.

    Parameters
    ----------
    argv : list of str, optional
        ``[prog, configfile, block, subblocktype, subblock]``.  Defaults to
        ``sys.argv``.  The ``block`` argument is not used; the block name is
        taken from the config file.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'logpre.out'
    debugfh = open(default_log, 'w')
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv)) # command line for debugging

    if len(argv) < 5:
        print("Usage: logpre configfile block subblocktype subblock")
        debugfh.close()
        # do not leave the process writing to a closed file
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    subblocktype = argv[3]
    subblock = argv[4]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename('block',
                                       {pfwdefs.PF_CURRVALS: {'subblock': subblock,
                                                              'flabel': '${subblock}_logpre',
                                                              'fsuffix':'out'}})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")
    debugfh.close()

    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)

    # use the real streams while the final log handle is being set up
    sys.stdout = outorig
    sys.stderr = errorig

    if 'use_qcf' in config and config['use_qcf']:
        if config.dbh is None:
            # no DB handle on the config: export the service settings to the
            # environment and create the Messaging object without a handle
            if 'submit_des_services' in config:
                os.environ['DES_SERVICES'] = config.getfull('submit_des_services')
            os.environ['DES_DB_SECTION'] = config.getfull('submit_des_db_section')

            debugfh = Messaging.Messaging(new_log_name, 'logpre.py', config['pfw_attempt_id'], mode='a+')
        else:
            debugfh = Messaging.Messaging(new_log_name, 'logpre.py', config['pfw_attempt_id'], dbh=config.dbh, mode='a+')

    else:
        debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype, ['pretask'])

    print("logpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 10
0
def logpost(argv=None):
    """Log the end ("posttask") of a sub-block.

    While running, stdout and stderr are redirected to a log file: first
    ``logpost.out`` in the current directory, then (once the config has been
    read) the block-level ``<subblock>_logpost`` log inside ``block_dir``.
    The original streams are restored before returning.

    Parameters
    ----------
    argv : list of str, optional
        ``[prog, configfile, block, subblocktype, subblock, retval]``.
        Defaults to ``sys.argv``.  ``retval`` is only used when exactly six
        items are given; otherwise ``pfwdefs.PF_EXIT_FAILURE`` is assumed.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error.  Otherwise the given
        retval, with any non-success value replaced by
        ``pfwdefs.PF_EXIT_FAILURE`` unless the sub-block is ``begblock``.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('logpost.out', 'w')
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    if len(argv) < 5:
        print("Usage: logpost configfile block subblocktype subblock retval")
        debugfh.close()
        # do not leave the process writing to a closed file
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    subblocktype = argv[3]
    subblock = argv[4]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 6:
        # read from the argv actually passed in, not from sys.argv
        retval = int(argv[5])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
        miscutils.fwdebug_print(f"block = {blockname}")
        miscutils.fwdebug_print(f"subblock = {subblock}")
        miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    # the block name from the config replaces the one given on the command line
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename(
        'block', {
            pfwdefs.PF_CURRVALS: {
                'flabel': '${subblock}_logpost',
                'subblock': subblock,
                'fsuffix': 'out'
            }
        })
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    # use the real streams while the log is renamed and reopened
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig

    os.chmod('logpost.out', 0o666)
    os.rename('logpost.out', new_log_name)
    if 'use_qcf' in config and config['use_qcf']:
        if config.dbh is None:
            # no DB handle on the config: export the service settings to the
            # environment and create the Messaging object without a handle
            if 'submit_des_services' in config:
                os.environ['DES_SERVICES'] = config.getfull(
                    'submit_des_services')
            os.environ['DES_DB_SECTION'] = config.getfull(
                'submit_des_db_section')

            debugfh = Messaging.Messaging(new_log_name,
                                          'logpost.py',
                                          config['pfw_attempt_id'],
                                          mode='a+')
        else:
            debugfh = Messaging.Messaging(new_log_name,
                                          'logpost.py',
                                          config['pfw_attempt_id'],
                                          dbh=config.dbh,
                                          mode='a+')

    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype,
                  ['posttask', retval])

    # collapse every non-success value into the generic failure code, except
    # for begblock whose retval is passed through unchanged
    if subblock != 'begblock' and retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print(f"returning retval = {retval}")
    miscutils.fwdebug_print("logpost done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    # this last message is emitted after the original streams are restored
    miscutils.fwdebug_print(f"Exiting with = {retval}")
    return int(retval)
Esempio n. 11
0
def jobpre(argv=None):
    """Perform the steps needed before a pipeline job is submitted.

    While running, stdout and stderr are redirected to a log file: first a
    temporary file in the current directory, then (once the config has been
    read) the job's ``jobpre`` log inside ``<block_dir>/<padded jobnum>``.
    If DB output is enabled, the job's ``condor_submit_time`` and
    ``target_submit_time`` are set to the database's current timestamp.

    Parameters
    ----------
    argv : list of str, optional
        ``[prog, configfile, jobnum]``.  Defaults to ``sys.argv``.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    # Remember the real streams so they can be put back on every normal
    # return.  They are deliberately NOT restored when an exception escapes,
    # so that the traceback still ends up in the log file.
    outorig = sys.stdout
    errorig = sys.stderr

    debugfh = tempfile.NamedTemporaryFile(mode='w+',
                                          prefix='jobpre_',
                                          dir='.',
                                          delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print('Usage: jobpre configfile jobnum')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # read from the argv actually passed in, not from sys.argv
    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    # NOTE(review): the other entry points read block_dir with getfull();
    # confirm that plain get() is intended here
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'job', {
            pfwdefs.PF_CURRVALS: {
                pfwdefs.PF_JOBNUM: jobnum,
                'flabel': 'jobpre',
                'fsuffix': 'out'
            }
        })
    new_log_name = "%s/%s/%s" % (blkdir, tjpad, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    # move the temporary log to its final name and keep appending to it
    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {
            'condor_submit_time': ctstr,
            'target_submit_time': ctstr
        })

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 12
0
def blockpost(argv=None):
    """ Program entry point """
    realstdout = sys.stdout
    realstderr = sys.stderr

    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh
    print(' '.join(argv))  # print command line for debugging

    print(f"running on {socket.gethostname()}")

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
    miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename('block',
                                       {pfwdefs.PF_CURRVALS: {'flabel': 'blockpost',
                                                              'fsuffix':'out'}})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrapinfo = {}
    jobinfo = {}
    failedwraps = {}
    whyfailwraps = {}   # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    sem = None
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        try:
            miscutils.fwdebug_print("Connecting to DB")
            if config.dbh is None:
                dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                                  config.getfull('submit_des_db_section'))
            else:
                dbh = config.dbh
            if verify_files:
                curs = dbh.cursor()
                curs.execute(f"select root from ops_archive where name='{config.getfull('home_archive')}'")
                rows = curs.fetchall()
                if rows is None or len(rows) != 1:
                    raise Exception(f"Invalid archive name ({config.getfull('home_archive')}).   Found {len(rows)} rows in ops_archive")
                root = rows[0][0]
                if not os.path.isdir(root):
                    print(f"Cannot read archive root directory:{config.getfull('home_archive')} This program must be run on an NCSA machine with access to the archive storage system.")
                sem = dbsem.DBSemaphore('verify_files_10', None, config.getfull('submit_des_services'), config.getfull('submit_des_db_section'), connection=dbh)
                print("\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(dbh=dbh, archive=config.getfull('home_archive'), pfwid=attid, md5sum=False, debug=False, script=False, verbose=False, silent=True)
                if sem is not None:
                    del sem
                    sem = None
                print(f"  Verification of files returned status {verify_status:d}")
                if verify_status != 0:
                    print("  This indicates that one or more files do not have the correct file size (based on DB entries). Run")
                    print(f"\n    compare_db.py --des_services {config.getfull('submit_des_services')} --section {config.getfull('submit_des_db_section')} --archive {config.getfull('home_archive')} --pfwid {int(attid):d} --verbose")
                    print("\n  to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                #qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'),
                #                   config.getfull('submit_des_db_section'))
                qdbh = qcfdb.QCFDB(connection=dbh)

            print(f"\n\nChecking non-job block task status from task table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)")
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id'] and
                    str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(f"Done getting block task info from DB ({end_time - start_time} secs)")
                for bltdict in bltasks.values():
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += f"\t{bltdict['name']}"
                        if bltdict['label'] is not None:
                            msg2 += f" - {bltdict['label']}"
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print("Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = f"select id from task where parent_task_id={begblock_tid:d} and status!=0"
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print("Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(f"Done querying QCF messages ({end_time-start_time} secs)")
                                miscutils.fwdebug_print(f"wrapmsg = {wrapmsg}")
                            if not wrapmsg:
                                msg2 += "    No QCF messages\n"
                            else:
                                for msgs in wrapmsg.values():
                                    for m in msgs:
                                        msg2 += "    " + m['message'] + "\n"

                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print(f"\n\nChecking job status from pfw_job table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)")

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done getting job info from DB ({end_time - start_time} secs)")
                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid, pfw_block_task_id=blktid)
                    if retval != pfwdefs.PF_EXIT_SUCCESS:
                        jobwrap = dbh.get_jobwrapper_info(id=attid)
                    else:
                        jobwrap = {}
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done getting wrapper info from DB ({end_time - start_time} secs)")
            else:
                msg = f"Could not find task id for block {blockname} in config.des"
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    Messaging.pfw_message(dbh, attid, config['task_id']['attempt'],
                                          msg, pfwdefs.PFWDB_MSG_INFO, 'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done saving pfw message ({end_time - start_time} secs)")
                print("all the task ids:", config['task_id'])


            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_log_fullnames(attid, archive)
            #dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print(f"Warn: could not find jobs for block {blknum}")
                print("      This is ok if attempt died before jobs ran")
                print("      block task_ids in job_byblk:", list(job_byblk.keys()))
            else:
                wrap_byjob, _ = pfwutils.index_wrapper_info(wrapinfo)
                #for wid,jwr in jobwrap.iteritems():
                    #print wid,jwr

                # in case the post wrapper stuff failed, internally mark the task
                # as failed to retrieve the info later
                for wrapb in wrap_byjob.values():
                    for wrapper in wrapb.values():
                        if wrapper['parent_task_id'] in jobwrap and jobwrap[wrapper['parent_task_id']]['status'] is not None \
                           and wrapper['status'] is not None and jobwrap[wrapper['parent_task_id']]['status'] > wrapper['status']:
                            wrapper['status'] = jobwrap[wrapper['parent_task_id']]['status']

                #print "wrap_bymod:", wrap_bymod
                jobtid = ''
                jobdict = {}
                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict['status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']
                        #print "jobkeys = ", jobkeys, type(jobkeys)

                    submit_job_path = f"{config.getfull('work_dir')}/B{int(config.getfull('blknum')):02d}-{config.getfull('blockname'):s}/{int(jobdict['jobnum']):04d}"
                    msg2 += f"\n\t{pfwutils.pad_jobnum(jobdict['jobnum'])} ({jobkeys}) "


                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        #print "wrapnum in job =", wrap_byjob[jobtid].keys()
                        maxwrap = max(wrap_byjob[jobtid])
                        #print "maxwrap =", maxwrap
                        modname = wrap_byjob[jobtid][maxwrap]['modname']
                        #print "modname =", modname

                        msg2 += f"{len(wrap_byjob[jobtid]):d}/{jobdict['expect_num_wrap']}  {modname}"

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in wrap_byjob[jobtid].items():
                            if wdict['status'] is None or wdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                    msg2 += f"\n\t\t{submit_job_path}/runjob.out "

                msg2 += '\n'

                # print pfw_messages
                if 'message' in jobdict:
                    print('\nmessages: ', jobdict['message'])
                    for msgdict in sorted(jobdict['message'], key=lambda k: k['message_time']):
                        level = int(msgdict['message_lvl'])
                        levelstr = 'info'
                        if level == pfwdefs.PFWDB_MSG_WARN:
                            levelstr = 'WARN'
                        elif level == pfwdefs.PFWDB_MSG_ERROR:
                            levelstr = 'ERROR'

                        msg2 += "\t\t{} - {}\n".format(levelstr, msgdict['message'].replace('\n', '\n\t\t\t'))

                if jobtid in wrap_byjob:
                    # print log file name for failed/unfinished wrappers
                    for wrapnum in failedwraps[jobtid]:
                        wrapdict = wrap_byjob[jobtid][wrapnum]
                        if wrapdict['log'] in logfullnames:
                            msg2 += f"\t\t{wrapnum} - {logfullnames[wrapdict['log']]}\n"
                        else:
                            msg2 += f"\t\t{wrapnum} - Could not find log in archive {wrapdict['log']})\n"
                        wrapmsg = get_qcf_messages(qdbh, [wrapdict['task_id']])
                        msg2 += print_qcf_messages(wrapdict, wrapmsg)

                    msg2 += '\n'

                    # If weirdness happened in run, print a message
                    if whyfailwraps[jobtid]:
                        msg2 += "\n*** Contact framework developers.   Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                        msg2 += f"\t{','.join(whyfailwraps[jobtid])}\n"

        except Exception as exc:
            if sem is not None:
                del sem
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print("\n\nEncountered error trying to gather status information for email")
            print(f"{exc.__class__.__name__}: {str(exc)}")
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE
    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = f"{run}:  In dryrun mode, block {blockname} has finished successfully."
            else:
                msg1 = f"{run}:  In dryrun mode, block {blockname} has failed."

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = f"{run}:  block {blockname} has failed."
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = f"{run}:  block {blockname} has finished successfully."
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            numblocks = len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = f"{run}:  run has finished successfully."
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    #dbh = None
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        if dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        #dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ", len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))
        if int(config[pfwdefs.PF_BLKNUM]) > len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')) and  miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
            #dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section'))
            updatevals = {'PROCESSING_STATE': 'PASS'}
            wherevals = {'PFW_ATTEMPT_ID': attid}
            dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals)
            dbh.commit()
            #dbh.close()
    elif miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        #dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section'))
        updatevals = {'PROCESSING_STATE': 'FAIL'}
        wherevals = {'PFW_ATTEMPT_ID': attid}
        dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals)
        dbh.commit()
        #dbh.close()
    if dbh is not None:
        dbh.close()
    miscutils.fwdebug_print(f"Returning retval = {retval} ({type(retval)})")
    miscutils.fwdebug_print("END")
    debugfh.close()
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"Exiting with = {exitcode}")
        miscutils.fwdebug_print(f"type of exitcode = {type(exitcode)}")

    sys.stdout = realstdout
    sys.stderr = realstderr

    return int(retval)
Esempio n. 13
0
def blockpre(argv=None):
    """Program entry point for the block pre-script.

    While running, stdout and stderr are redirected to a log file.  The log
    starts out as ``blockpre.out`` in the current directory and, once the
    config has been read, is renamed to the block-specific name returned by
    ``config.get_filename`` inside the block directory.

    Steps performed: read the config, call ``config.set_block_info()``,
    rewrite the config file, move the log, ``chdir`` into the block
    directory, call ``write_block_condor`` and log a ``pretask`` event.

    Parameters
    ----------
    argv : list of str, optional
        Command line in ``sys.argv`` layout; ``argv[1]`` is the config file.
        Defaults to ``sys.argv``.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_FAILURE`` when the argument count is wrong,
        otherwise ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'blockpre.out'

    # Keep the real streams so they can be put back on the normal return
    # paths; otherwise anything printed by the caller afterwards would hit a
    # closed file.  They are deliberately NOT restored when an exception
    # escapes, so the traceback still ends up in the log file.
    realstdout = sys.stdout
    realstderr = sys.stderr

    debugfh = open(default_log, 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging

    if len(argv) < 2 or len(argv) > 3:
        print('Usage: blockpre configfile')
        debugfh.close()
        sys.stdout = realstdout
        sys.stderr = realstderr
        return pfwdefs.PF_EXIT_FAILURE

    # Use the argv that was validated above, not sys.argv, so that callers
    # passing an explicit argument list get the config file they asked for.
    configfile = argv[1]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # make sure values which depend upon block are set correctly
    config.set_block_info()
    miscutils.fwdebug_print(f"blknum = {config[pfwdefs.PF_BLKNUM]}")

    # persist the refreshed block information back to the config file
    with open(configfile, 'w') as cfgfh:
        config.write(cfgfh)

    blockname = config.getfull('blockname')
    miscutils.fwdebug_print(f"blockname = {blockname}")

    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {
            'flabel': 'blockpre',
            'fsuffix': 'out'
        }})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    # The log has to be closed before it is moved; it is then reopened in
    # append mode so the output written so far is kept.
    debugfh.close()
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    write_block_condor(config)

    log_pfw_event(config, blockname, 'blockpre', 'j', ['pretask'])

    miscutils.fwdebug_print("blockpre done")
    debugfh.close()
    sys.stdout = realstdout
    sys.stderr = realstderr

    return pfwdefs.PF_EXIT_SUCCESS
Esempio n. 14
0
def logpost(argv=None):
    """Program entry point for the log post-script.

    While running, stdout and stderr are redirected to a log file.  The log
    starts out as ``logpost.out`` in the current directory and, once the
    config has been read, is renamed to the block-specific name returned by
    ``config.get_filename`` inside the block directory.  A ``posttask``
    event carrying the return value is then logged via ``log_pfw_event``.

    Parameters
    ----------
    argv : list of str, optional
        Command line in ``sys.argv`` layout:
        ``[prog, configfile, block, subblocktype, subblock, retval]``.
        ``retval`` is only read when exactly six entries are given; otherwise
        ``pfwdefs.PF_EXIT_FAILURE`` is assumed.  Defaults to ``sys.argv``.

    Returns
    -------
    int
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error.  Otherwise the given
        return value, except that any non-success value is collapsed to
        ``pfwdefs.PF_EXIT_FAILURE`` unless the subblock is ``begblock``.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'logpost.out'

    # Keep the real streams so they can be put back on the normal return
    # paths; otherwise anything printed by the caller afterwards would hit a
    # closed file.  They are deliberately NOT restored when an exception
    # escapes, so the traceback still ends up in the log file.
    realstdout = sys.stdout
    realstderr = sys.stderr

    # open file to catch error messages about command line
    debugfh = open(default_log, 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    if len(argv) < 5:
        print('Usage: logpost configfile block subblocktype subblock retval')
        debugfh.close()
        sys.stdout = realstdout
        sys.stderr = realstderr
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    subblocktype = argv[3]
    subblock = argv[4]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 6:
        # Read from the argv that was validated above, not sys.argv, so an
        # explicitly passed argument list is honoured.
        retval = int(argv[5])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
        miscutils.fwdebug_print(f"block = {blockname}")
        miscutils.fwdebug_print(f"subblock = {subblock}")
        miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    # NOTE: from here on the block name from the config replaces the one
    # given on the command line.
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename(
        'block', {
            pfwdefs.PF_CURRVALS: {
                'flabel': '${subblock}_logpost',
                'subblock': subblock,
                'fsuffix': 'out'
            }
        })
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    # The log has to be closed before it is moved; it is then reopened in
    # append mode so the output written so far is kept.
    debugfh.close()
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype,
                  ['posttask', retval])

    # In order to continue, make pipelines dagman jobs exit with success status
    #if 'pipelinesmngr' not in subblock:
    #    retval = pfwdefs.PF_EXIT_SUCCESS

    #    # If error at non-manager level, send failure email
    #    if retval != pfwdefs.PF_EXIT_SUCCESS and \
    #        'mngr' not in subblock:
    #        send_subblock_email(config, blockname, subblock, retval)

    # Any non-success status is normalised to the generic failure code,
    # except for the begblock subblock whose status is passed through as is.
    if subblock != 'begblock' and retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print(f"returning retval = {retval}")
    miscutils.fwdebug_print("logpost done")
    debugfh.close()
    sys.stdout = realstdout
    sys.stderr = realstderr
    return int(retval)