Ejemplo n.º 1
0
 def _load_raw(branches):
     """Load this job's slice of events from every input ROOT file.

     Relies on the enclosing scope for ``md`` (metadata with ``inputfiles``,
     ``num_events``, ``treename``, ``selection``), ``frac`` (fraction of each
     file per job), ``jobid``, ``batch_mode`` and the ``xrd`` path helper.

     Args:
         branches: branch names to read from the tree.

     Returns:
         A numpy record array concatenating the selected events from all files.

     Raises:
         RuntimeError: if a file still cannot be read after 5 attempts.
     """
     pieces = []
     for fn, n in zip(md.inputfiles, md.num_events):
         # Each job reads an equal-sized slice [start, stop) of every file.
         step = int(math.ceil(frac * n))
         start = step * jobid
         stop = start + step
         if start >= n:
             # This job's slice falls entirely beyond this file's events.
             continue
         filepath = xrd(fn) if batch_mode else fn
         # Retry up to 5 times: remote (xrootd) reads fail transiently.
         for _trial in range(5):
             try:
                 a = root2array(filepath,
                                treename=md.treename,
                                selection=md.selection,
                                branches=branches,
                                start=start,
                                stop=stop)
                 break
             except Exception:
                 # Narrowed from a bare `except:` so KeyboardInterrupt /
                 # SystemExit still propagate during the retry loop.
                 logging.error('Error reading %s:\n%s' %
                               (filepath, traceback.format_exc()))
                 time.sleep(10)
         else:
             # All 5 attempts failed.
             raise RuntimeError('Cannot read file %s' % filepath)
         pieces.append(a)
     rec = np.concatenate(pieces)
     return rec
Ejemplo n.º 2
0
def submit(args):
    """Prepare an HTCondor submission for the conversion jobs.

    Writes three files into ``args.jobdir``:
      - ``runjob.sh``: the worker bash script (conda env setup, runs
        ``converter.py``, optionally ``xrdcp``s the output for /eos paths);
      - ``submit.txt`` or ``resubmit.txt``: the list of job ids to queue;
      - ``submit.cmd``: the condor submit description.

    On a fresh submission (``not args.resubmit``) the metadata is updated
    (via ``update_metadata``, presumably determining ``njobs`` — TODO
    confirm) and all job ids are queued. On resubmission, the condor
    ``.log`` files are scanned to queue only the failed jobs.

    Does not actually submit; prints the ``condor_submit`` command to run.
    """
    
    scriptfile = os.path.join(args.jobdir, 'runjob.sh')
    metadatafile = os.path.join(args.jobdir, args.metadata)

    if not args.resubmit:
        from helper import xrd
        md, njobs = update_metadata(args)

        # Worker script template. {xrdcp} is only filled in for /eos output
        # directories, where the .h5 files are copied back via xrootd.
        script = \
'''#!/bin/bash
jobid=$1
workdir=`pwd`

echo `hostname`
echo "workdir: $workdir"
echo "args: $@"
ls -l

export PATH={conda_path}:$PATH
source activate {conda_env_name}
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"

python {script} {outputdir} $jobid -n {events} {test_sample}
status=$?
echo "Status = $status"
ls -l

if [ $status -ne 0 ]; then
    exit $status
else
    echo
    {xrdcp}
fi

exit $status
'''.format(conda_path=args.conda_path,
           conda_env_name=args.conda_env_name,
           script=os.path.abspath('converter.py'),
           outputdir=args.outputdir,
           events=args.events_per_file,
           test_sample='--test-sample' if args.test_sample else '',
           xrdcp='' if not args.outputdir.startswith('/eos') else 'xrdcp -np *.h5 %s ; rm *.h5' % (xrd(args.outputdir) + '/')
           )

        with open(scriptfile, 'w') as f:
            f.write(script)
        os.system('chmod +x %s' % scriptfile)

        # Fresh submission: queue every job id from 0 to njobs-1.
        jobids = [str(jobid) for jobid in range(njobs)]
        jobids_file = os.path.join(args.jobdir, 'submit.txt')

    else:
        # resubmit
        # Scan each condor .log file from the end to classify its job:
        # removed/aborted or nonzero return value -> failed, re-queue it.
        jobids = []
        jobids_file = os.path.join(args.jobdir, 'resubmit.txt')
        log_files = [f for f in os.listdir(args.jobdir) if f.endswith('.log')]
        for fn in log_files:
            with open(os.path.join(args.jobdir, fn)) as logfile:
                errormsg = None
                for line in reversed(logfile.readlines()):
                    if 'Job removed' in line or 'aborted' in line:
                        errormsg = line
                    if 'Job submitted from host' in line:
                        # if seeing this first: the job has been resubmitted
                        break
                    if 'return value' in line:
                        if 'return value 0' not in line:
                            errormsg = line
                        break
                if errormsg:
                    logging.debug(fn + '\n   ' + errormsg)
                    # Log files are named "<jobid>.log" (see the condor
                    # description below), so the stem is the job id.
                    jobids.append(fn.split('.')[0])
                    assert jobids[-1].isdigit()

    with open(jobids_file, 'w') as f:
        f.write('\n'.join(jobids))

    # HTCondor submit description; `queue jobid from` reads the ids file.
    condordesc = '''\
universe              = vanilla
requirements          = (Arch == "X86_64") && (OpSys == "LINUX")
request_disk          = 10000000
executable            = {scriptfile}
arguments             = $(jobid)
transfer_input_files  = {metadatafile}
output                = {jobdir}/$(jobid).out
error                 = {jobdir}/$(jobid).err
log                   = {jobdir}/$(jobid).log
use_x509userproxy     = true
+MaxRuntime           = 172800
Should_Transfer_Files = YES
queue jobid from {jobids_file}
'''.format(scriptfile=os.path.abspath(scriptfile),
           metadatafile=os.path.abspath(metadatafile),
           jobdir=os.path.abspath(args.jobdir),
           outputdir=args.outputdir,
           jobids_file=os.path.abspath(jobids_file)
    )
    condorfile = os.path.join(args.jobdir, 'submit.cmd')
    with open(condorfile, 'w') as f:
        f.write(condordesc)

    print('Run the following command to submit the jobs:\ncondor_submit {condorfile}'.format(condorfile=condorfile))
Ejemplo n.º 3
0
def writeData(md,
              outputdir,
              jobid,
              batch_mode=False,
              test_sample=False,
              events=200000,
              dryrun=False):
    '''Convert this job's slice of the input ROOT files to an HDF5 file.

    Parameters
    ----------
    md :
        Metadata object providing ``inputfiles``, ``num_events``,
        ``treename``, ``selection`` and the branch/label/weight lists.
    outputdir :
        Destination directory for the output file.
    jobid : int
        Index of this job; selects which slice of each input file to read.
    batch_mode : bool
        If True, read inputs via the ``xrd`` helper and write the output
        to the current working directory (for later transfer back).
    test_sample : bool
        If True, name the output ``test_file_*`` and skip shuffling.
    events : int
        Approximate total number of events this job should convert.
    dryrun : bool
        If True, do everything except writing the output file.
    '''
    from root_numpy import root2array

    def _write(rec, output):
        # Build all datasets (labels, weights, transformed variables and,
        # if configured, images) into a single HDF5 file.
        logging.debug(log_prefix + 'Start making output file')
        with tables.open_file(output, mode='w') as h5file:
            _make_labels(md, rec, h5file)
            logging.debug(log_prefix + 'Start producing weights')
            _make_weight(md, rec, h5file)
            _make_class_weight(md, rec, h5file)
            logging.debug(log_prefix + 'Start transforming variables')
            _transform_var(md,
                           rec,
                           h5file,
                           md.var_no_transform_branches,
                           no_transform=True)
            _transform_var(md, rec, h5file, md.var_branches)
            if md.var_img:
                logging.debug(log_prefix + 'Start making images')
                _make_image(md, rec, h5file, output='img')

    log_prefix = '[%d] ' % jobid
    outname = '{type}_file_{jobid}.h5'.format(
        type='test' if test_sample else 'train', jobid=jobid)
    output = os.path.join(outputdir, outname)
    if os.path.exists(output) and os.path.getsize(output) > 100 * 1024 * 1024:
        # ignore if > 100M
        logging.info(log_prefix + 'File %s already exist! Skipping.' % output)
        return

    # Fraction of each file to read so that all files together contribute
    # roughly `events` events to this job.
    frac = float(events) / sum(md.num_events)
    use_branches = set(md.var_branches + md.var_no_transform_branches +
                       md.label_branches + md.reweight_classes +
                       md.reweight_var)
    if md.var_img:
        use_branches |= set([md.var_img] + md.var_pos)


#     use_branches = [str(var) for var in use_branches]
    logging.debug(log_prefix + 'Start loading from root files')

    pieces = []
    for fn, n in zip(md.inputfiles, md.num_events):
        # Each job reads an equal-sized slice [start, stop) of every file.
        step = int(math.ceil(frac * n))
        start = step * jobid
        stop = start + step
        if start >= n:
            continue
        filepath = xrd(fn) if batch_mode else fn
        #         logging.debug('Load events [%d, %d) from file %s' % (start, stop, filepath))
        a = root2array(filepath,
                       treename=md.treename,
                       selection=md.selection,
                       branches=use_branches,
                       start=start,
                       stop=stop)
        pieces.append(a)
    if not pieces:
        # No file contributed events for this jobid (every slice was out of
        # range); np.concatenate([]) would raise ValueError, so bail out now.
        return
    rec = np.concatenate(pieces)
    if rec.shape[0] == 0:
        return
    if not test_sample:
        # important: shuffle the array if not for testing
        np.random.shuffle(rec)

    if batch_mode:
        # In batch mode write to the local working directory; the batch
        # wrapper script is responsible for transferring the file back.
        if not dryrun:
            _write(rec, outname)
        logging.info(log_prefix + 'Writing output to: \n' + outname)
    else:
        # Write to a temp name first, then atomically rename, so a killed
        # job never leaves a truncated file under the final name.
        output_tmp = output + '.tmp'
        if not dryrun:
            _write(rec, output_tmp)
            os.rename(output_tmp, output)
        logging.info(log_prefix + 'Writing output to: \n' + output)

    logging.info(log_prefix + 'Done!')
Ejemplo n.º 4
0
def submit(args):
    """Prepare an HTCondor submission for the conversion jobs.

    Variant that ships the whole preprocessing package to the worker as a
    gzipped tarball and supports an LCG software environment (when
    ``LCG_VERSION`` is set) as an alternative to conda.

    Writes into ``args.jobdir``:
      - the tarball of this script's directory (``args.tarball``);
      - ``runjob.sh``: the worker bash script;
      - ``submit.txt`` or ``resubmit.txt``: the list of job ids to queue;
      - ``submit.cmd``: the condor submit description.

    On a fresh submission (``not args.resubmit``) the metadata is updated
    (via ``update_metadata``, presumably determining ``njobs`` — TODO
    confirm) and all job ids are queued. On resubmission, the condor
    ``.log`` files are scanned to queue only the failed jobs.

    Does not actually submit; prints the ``condor_submit`` command to run.
    """
    
    scriptfile = os.path.join(args.jobdir, 'runjob.sh')
    metadatafile = os.path.join(args.jobdir, args.metadata)

    # Package the directory containing this script so the worker can unpack
    # it as `preprocessing/` and run the bundled converter.py.
    tarball = os.path.join(args.jobdir, args.tarball)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    import tarfile
    with tarfile.open(tarball, mode='w:gz') as archive:
        archive.add(dir_path, arcname='preprocessing', recursive=True)

    if not args.resubmit:
        from helper import xrd
        md, njobs = update_metadata(args)

        # Pick the worker environment setup: LCG release + tarball-local
        # python packages if LCG_VERSION is set here, otherwise conda.
        if 'LCG_VERSION' in os.environ:
            env_setup = 'source %s\n' % args.lcg_env
            env_setup += 'tar xvzf preprocessing.tar.gz\n'
            env_setup += 'export PYTHONPATH=`pwd`/preprocessing/.local/lib/python3.6/site-packages:$PYTHONPATH\n'
        else:
            env_setup = '''export PATH={conda_path}:$PATH
source activate {conda_env_name}'''.format(conda_path=args.conda_path, conda_env_name=args.conda_env_name)

        # Worker script template. {xrdcp} is only filled in for /eos output
        # directories, where the .h5 files are copied back via xrootd.
        script = \
'''#!/bin/bash
jobid=$1
workdir=`pwd`

echo `hostname`
echo "workdir: $workdir"
echo "args: $@"
ls -l

{env_setup}

echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"

python {script} {outputdir} $jobid -n {events} {test_sample}
status=$?
echo "Status = $status"
ls -l

if [ $status -ne 0 ]; then
    exit $status
else
    echo
    {xrdcp}
fi

exit $status
'''.format(env_setup=env_setup,
           script='preprocessing/converter.py',
           outputdir=args.outputdir,
           events=args.events_per_file,
           test_sample='--test-sample' if args.test_sample else '',
           xrdcp='' if not args.outputdir.startswith('/eos') else 'xrdcp -np *.h5 %s ; rm *.h5' % (xrd(args.outputdir) + '/')
           )

        with open(scriptfile, 'w') as f:
            f.write(script)
        os.system('chmod +x %s' % scriptfile)

        # Fresh submission: queue every job id from 0 to njobs-1.
        jobids = [str(jobid) for jobid in range(njobs)]
        jobids_file = os.path.join(args.jobdir, 'submit.txt')

    else:
        # resubmit
        # Scan each condor .log file from the end to classify its job:
        # removed/aborted or nonzero return value -> failed, re-queue it.
        jobids = []
        jobids_file = os.path.join(args.jobdir, 'resubmit.txt')
        log_files = [f for f in os.listdir(args.jobdir) if f.endswith('.log')]
        for fn in log_files:
            with open(os.path.join(args.jobdir, fn)) as logfile:
                errormsg = None
                for line in reversed(logfile.readlines()):
                    if 'Job removed' in line or 'aborted' in line:
                        errormsg = line
                    if 'Job submitted from host' in line:
                        # if seeing this first: the job has been resubmitted
                        break
                    if 'return value' in line:
                        if 'return value 0' not in line:
                            errormsg = line
                        break
                if errormsg:
                    logging.debug(fn + '\n   ' + errormsg)
                    # Log files are named "<jobid>.log" (see the condor
                    # description below), so the stem is the job id.
                    jobids.append(fn.split('.')[0])
                    assert jobids[-1].isdigit()

    with open(jobids_file, 'w') as f:
        f.write('\n'.join(jobids))

    # HTCondor submit description; `queue jobid from` reads the ids file.
    # The tarball is shipped to the worker via transfer_input_files.
    condordesc = '''\
universe              = vanilla
request_disk          = 10000000
request_memory        = 8192
executable            = {scriptfile}
arguments             = $(jobid)
transfer_input_files  = {metadatafile},{tarball}
output                = {jobdir}/$(jobid).out
error                 = {jobdir}/$(jobid).err
log                   = {jobdir}/$(jobid).log
use_x509userproxy     = true
+MaxRuntime           = 24000
Should_Transfer_Files = YES
queue jobid from {jobids_file}
'''.format(scriptfile=os.path.abspath(scriptfile),
           metadatafile=os.path.abspath(metadatafile),
           tarball = os.path.abspath(tarball),
           jobdir=os.path.abspath(args.jobdir),
           outputdir=args.outputdir,
           jobids_file=os.path.abspath(jobids_file)
    )
    condorfile = os.path.join(args.jobdir, 'submit.cmd')
    with open(condorfile, 'w') as f:
        f.write(condordesc)

    print('Run the following command to submit the jobs:\n condor_submit {condorfile}'.format(condorfile=condorfile))