Example 1
File: prep.py Project: tsusa/Xbb
def parse_sample_config(samples_info):

    parser = myutils.BetterConfigParser()
    parser.read(samples_info)

    LFN_dir = parser.get('General', 'LFN_dir')
    xrd_redirector = parser.get('General', 'xrd_redirector')
    merge = parser.getboolean('General', 'merge')
    samples = myutils.ParseInfo(samples_info, '')

    return LFN_dir, xrd_redirector, merge, samples
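
For context, a minimal sketch of how this helper might be called (the file name
samples_nosplit.ini is hypothetical; BetterConfigParser and ParseInfo come from
the project's myutils module):

LFN_dir, xrd_redirector, merge, samples = parse_sample_config('samples_nosplit.ini')
print LFN_dir, xrd_redirector, merge
for sample in samples:
    print sample.name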
Example 2
else:
    print "Unknown Pt region"
    pt_region = 'NoSysRegion'
    #sys.exit("Unknown Pt region")
# Set rescale factor of 2 in case of TrainFlag
if TrainFlag:
    MC_rescale_factor = 2.
    print 'I RESCALE BY 2.0'
else:
    MC_rescale_factor = 1.
#systematics up/down
UD = ['Up','Down']

print 'Parse the sample information'
print '============================\n'
#Parse samples configuration
info = myutils.ParseInfo(samplesinfo, path)
# get all the treeCut sets
# create different sample Lists

print 'Get the sample list'
print '===================\n'
all_samples = info.get_samples(signals + backgrounds + additionals)
print 'workspace_datacard-all_samples:', [job.name for job in all_samples]

signal_samples = info.get_samples(signals)
print 'signal samples:', [job.name for job in signal_samples]

background_samples = info.get_samples(backgrounds)
data_sample_names = config.get('dc:%s' % var, 'data').split(' ')
data_samples = info.get_samples(data_sample_names)
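
The data sample names above are read from a per-variable datacard section of the
config. A minimal sketch of the expected layout, with a hypothetical variable
name 'BDT' and hypothetical dataset names (inferred from the
config.get('dc:%s' % var, 'data') call):

# the config is assumed to contain a section like:
#   [dc:BDT]
#   data = Run2016B Run2016C
var = 'BDT'
data_sample_names = config.get('dc:%s' % var, 'data').split(' ')
# -> ['Run2016B', 'Run2016C']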
Example 3
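# imports assumed from the surrounding prep.py (not shown in this excerpt):
# import os, re, sys, time, shutil, getpass, subprocess
# import ROOT
# import myutils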
def main(argv=None):

    args = parse_command_line(argv)

    timestamp = time.strftime('%a_%b_%d_%Y_%H-%M-%S')

    if args.verbose:
        print 'List of Samples: {!s}\n'.format(args.samples)
        print 'Current Timestamp: {}\n'.format(timestamp)

    # the list of config files is taken from the paths config
    configDir = args.tag + 'config'
    parser = myutils.BetterConfigParser()
    parser.read(os.path.join(configDir, 'paths.ini'))
    configs = [
        os.path.join(configDir, config)
        for config in parser.get('Configuration', 'List').split()
    ]
    parser.read(configs)

    if args.verbose:
        print 'configs', configs
        print 'args.tagdir', args.tagdir

    if args.tagdir != '':
        tagDir = parser.get('Directories', 'tagDir')
        if args.verbose:
            print 'tagDir', tagDir

        DirStruct = {
            'tagDir': tagDir,
            'ftagdir': '%s/%s/' % (tagDir, args.tagdir),
            'logpath': '%s/%s/%s/' % (tagDir, args.tagdir, 'Logs'),
            'plotpath': '%s/%s/%s/' % (tagDir, args.tagdir, 'Plots'),
            'limitpath': '%s/%s/%s/' % (tagDir, args.tagdir, 'Limits'),
            'confpath': '%s/%s/%s/' % (tagDir, args.tagdir, 'config')
        }

        if args.verbose:
            print 'DirStruct', DirStruct

        for key in [
                'tagDir', 'ftagdir', 'logpath', 'plotpath', 'limitpath',
                'confpath'
        ]:
            try:
                os.stat(DirStruct[key])
            except OSError:
                os.mkdir(DirStruct[key])

        pathfile = open('%sconfig/paths.ini' % args.tag)
        lines = pathfile.readlines()
        pathfile.close()
        os.rename('%sconfig/paths.ini' % args.tag,
                  '%sconfig/paths.ini.bkp' % args.tag)
        pathfile = open('%sconfig/paths.ini' % args.tag, 'w')
        for line in lines:
            if line.startswith('plotpath'):
                line = 'plotpath = %s\n' % DirStruct['plotpath']
            elif line.startswith('logpath'):
                line = 'logpath = %s\n' % DirStruct['logpath']
            elif line.startswith('limits'):
                line = 'limits = %s\n' % DirStruct['limitpath']
            pathfile.write(line)
        pathfile.close()

        # copy config files
        for item in configs:
            shutil.copyfile(
                item, '%s/%s/%s' % (tagDir, args.tagdir,
                                    item.replace(args.tag, '', 1)))
            # str.replace removes the leading tag; str.strip would eat
            # characters from both ends, not the prefix

    # RETRIEVE RELEVANT VARIABLES FROM CONFIG FILES AND FROM COMMAND LINE OPTIONS
    logPath = parser.get('Directories', 'logpath')
    samplesinfo = parser.get('Directories', 'samplesinfo')
    whereToLaunch = parser.get('Configuration', 'whereToLaunch')
    run_locally = parser.getboolean('Configuration', 'run_locally')

    if args.local:
        print 'Overriding the "run_locally" configuration option with the local flag.\n'
        run_locally = True
    elif args.batch:
        print 'Overriding the "run_locally" configuration option with the batch flag.\n'
        run_locally = False

    print 'whereToLaunch', whereToLaunch
    print 'run_locally', run_locally

    # CREATE DIRECTORIES FOR PSI
    if 'PSI' in whereToLaunch:
        print 'Create the output folders PREPout, SYSout, MVAout if they do not exist'

        mkdir_list = [
            parser.get('Directories',
                       'PREPout').replace('root://t3dcachedb03.psi.ch:1094/',
                                          ''),
            parser.get('Directories',
                       'SYSout').replace('root://t3dcachedb03.psi.ch:1094/',
                                         ''),
            parser.get('Directories',
                       'MVAout').replace('root://t3dcachedb03.psi.ch:1094/',
                                         ''),
            parser.get('Directories', 'tmpSamples').replace(
                'root://t3dcachedb03.psi.ch:1094/', ''),
        ]

        for mkdir_protocol in mkdir_list:
            if args.verbose:
                print 'checking', mkdir_protocol
            _output_folder = ''
            for _folder in mkdir_protocol.split('/'):
                _output_folder += '/' + _folder
                if not os.path.exists(_output_folder):
                    command = 'srmmkdir srm://t3se01.psi.ch/' + _output_folder
                    subprocess.check_call([command], shell=True)
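                    # each missing path level is created one directory at a time, e.g.
                    # (hypothetical path): srmmkdir srm://t3se01.psi.ch//pnfs/psi.ch/scratch/user/PREPout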

    def dump_config(configs, output_file):
        """
        Dump all the configs into an output file.
        Args:
            output_file: the file where the log will be dumped
            configs: list of files (string) to be dumped
        Returns:
            nothing
        """
        with open(output_file, 'w') as outf:
            for i in configs:
                try:
                    with open(i, 'r') as f:
                        outf.write(f.read())
                except IOError:
                    print 'WARNING: Config ' + i + ' not found. It will not be used.'

    def compile_macro(config, macro):
        """
        Creates the library from a macro using CINT, compiling it in scratch to
        avoid problems with the linking on the worker nodes.
        Args:
            config: configuration parser where the macro path is specified
            macro: macro name to be compiled
        Returns:
            nothing
        """
        submitDir = os.getcwd()
        _macro = macro + '.h'
        library = config.get(macro, 'library')
        libDir = os.path.dirname(library)
        os.chdir(libDir)
        if not os.path.exists(library):
            print 'INFO: Compiling ' + _macro
            scratchDir = '/scratch/%s/' % getpass.getuser()
            os.system(
                'cp ' + libDir + '/* ' + scratchDir
            )  # OTHERWISE IT WILL NOT COMPILE, SINCE IT INCLUDES OTHER FILES!!!
            os.chdir(scratchDir)
            print os.listdir(scratchDir)
            ROOT.gROOT.ProcessLine(
                '.L %s+' %
                (scratchDir +
                 _macro))  # ROOT CRASHES WHILE COMPILING THE SECOND ONE...
            shutil.copyfile(
                '/scratch/%s/%s' %
                (getpass.getuser(), os.path.basename(library)), library)
            print 'INFO: macro', macro, 'compiled, exiting to avoid a ROOT crash, please resubmit!'
            sys.exit(1)
        os.chdir(submitDir)

    print '===============================\n'
    print 'Compiling the macros'
    print '===============================\n'
    # compile_macro(parser, 'BTagReshaping')
    compile_macro(parser, 'VHbbNameSpace')  # pass the parser: the macro path is read from it

    # check if the logPath exists; if not, exit
    if not os.path.isdir(logPath):
        print '@ERROR : ' + logPath + ': dir not found.'
        print '@ERROR : Create it before submitting.'
        print 'Exit'
        sys.exit(-1)

    # CREATE DICTIONARY TO BE USED AT JOB SUBMISSION TIME
    job_options = {
        'tag': args.tag,
        'logpath': logPath,
        'job': '',
        'task': args.task,
        'queue': 'all.q',
        'timestamp': timestamp,
        'bdt_params': '',
        'job_id': 'noid',
        'nprocesses': str(max(parser.getint('Configuration', 'nprocesses'), 1))
    }
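    # by submission time, 'job' and 'name' (and for some tasks 'job_id' and
    # 'bdt_params') are filled in, e.g. 'job': 'ZH125', 'name': 'ZH125_Zll2016prep'
    # for a hypothetical tag 'Zll2016' and task 'prep'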

    # STANDARD WORKFLOW SUBMISSION FUNCTION
    def submit(job, job_options, redirect_to_null=False):
        job_options['job'] = job
        global counter
        counter = 0
        counter += 1
        job_options['name'] = '%(job)s_%(tag)s%(task)s' % job_options
        if not run_locally:
            if whereToLaunch == 'lxplus':
                command = 'bsub -q 1nh -J {name} -o {logpath}/{timestamp}_{job}_{tag}_{task}.out runAll.sh {job} {tag} {task} {nprocesses} {job_id}'.format(
                    **job_options)
            else:
                command = 'qsub -V -cwd -q %(queue)s -l h_vmem=6G -N %(name)s -j y -o %(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.out -pe smp %(nprocesses)s runAll.sh %(job)s %(tag)s ' % (
                    job_options) + args.task + ' ' + job_options[
                        'nprocesses'] + ' ' + job_options[
                            'job_id'] + ' ' + job_options['bdt_params']
            print 'the command is ', command
            dump_config(
                configs,
                '%(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.config' %
                job_options)
            subprocess.check_call([command], shell=True)
        else:
            waiting_time_before_retry = 60
            number_simultaneous_processes = 4
            counter = int(
                subprocess.check_output(
                    'ps aux | grep $USER | grep ' + args.task + ' | wc -l',
                    shell=True)) - 1  # subtract 1 to exclude the matching grep process itself
            print 'counter command is', 'ps aux | grep $USER | grep ' + args.task + ' | wc -l'
            while counter > number_simultaneous_processes:
                print 'counter is', counter
                print 'waiting', waiting_time_before_retry, 'seconds before retrying'
                os.system('sleep ' + str(waiting_time_before_retry))
                counter = int(
                    subprocess.check_output('ps aux | grep $USER | grep ' +
                                            args.task + ' | wc -l',
                                            shell=True)) - 1  # same correction as above

            command = 'sh runAll.sh %(job)s %(tag)s ' % (
                job_options) + args.task + ' ' + job_options[
                    'nprocesses'] + ' ' + job_options[
                        'job_id'] + ' ' + job_options['bdt_params']
            if redirect_to_null:
                command = command + ' > /dev/null 2>&1 &'
            else:
                command = command + ' > %(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.out 2>&1' % (
                    job_options) + ' &'
            print 'the command is ', command
            dump_config(
                configs,
                '%(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.config' %
                job_options)
            subprocess.check_call([command], shell=True)
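
    # For illustration, a batch submission built by submit() might look like
    # (hypothetical job/tag values, PSI qsub branch):
    #   qsub -V -cwd -q all.q -l h_vmem=6G -N ZH125_Zll2016prep -j y \
    #       -o <logpath>/<timestamp>_ZH125_Zll2016_prep.out -pe smp 1 \
    #       runAll.sh ZH125 Zll2016 prep 1 noid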

    # SINGLE (i.e. FILE BY FILE) AND SPLIT FILE WORKFLOW SUBMISSION FUNCTION
    def submitsinglefile(job, job_options, files, run_locally, counter_local):
        job_options['job'] = job
        global counter
        counter = 0
        counter += 1
        job_options['name'] = '%(job)s_%(tag)s%(task)s' % job_options
        if run_locally:
            command = 'sh runAll.sh %(job)s %(tag)s ' % (
                job_options) + args.task + ' ' + job_options[
                    'nprocesses'] + ' ' + job_options['job_id'] + ' ' + (
                        '0' if not job_options['bdt_params'] else
                        job_options['bdt_params'])
        elif whereToLaunch == 'lxplus':
            command = 'bsub -q 1nh -J {name} -o {logpath}/{timestamp}_{job}_{tag}_{task}.out runAll.sh {job} {tag} {task} {nprocesses} {job_id}'.format(
                **job_options)
        else:
            command = 'qsub -V -cwd -q %(queue)s -l h_vmem=6G -N %(name)s -j y -o %(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.out -pe smp %(nprocesses)s runAll.sh %(job)s %(tag)s ' % (
                job_options) + args.task + ' ' + job_options[
                    'nprocesses'] + ' ' + job_options['job_id'] + ' ' + (
                        '0' if not job_options['bdt_params'] else
                        job_options['bdt_params'])
            command = command.replace('.out',
                                      '_' + str(counter_local) + '.out')
        print 'the command is ', command
        print 'submitting', len(
            files.split(';')), 'files like', files.split(';')[0]
        command = command + ' "' + str(files) + '"'
        dump_config(
            configs,
            '%(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.config' %
            (job_options))
        subprocess.check_call([command], shell=True)

    # MERGING FUNCTION FOR THE SINGLE (i.e. FILE BY FILE) AND SPLIT FILE WORKFLOWS, TO BE COMPATIBLE WITH THE OLD WORKFLOW
    def mergesubmitsinglefile(job, job_options, run_locally):
        job_options['job'] = job
        global counter
        counter = 0
        counter += 1
        job_options['name'] = '%(job)s_%(tag)s%(task)s' % job_options
        if run_locally:
            command = 'sh runAll.sh %(job)s %(tag)s ' % (
                job_options) + args.task + ' ' + job_options[
                    'nprocesses'] + ' ' + job_options['job_id'] + ' ' + (
                        '0' if not job_options['bdt_params'] else
                        job_options['bdt_params'])
        elif whereToLaunch == 'lxplus':
            command = 'bsub -q 1nh -J {name} -o {logpath}/{timestamp}_{job}_{tag}_{task}.out runAll.sh {job} {tag} {task} {nprocesses} {job_id}'.format(
                **job_options)
        else:
            command = 'qsub -V -cwd -q %(queue)s -l h_vmem=6G -N %(name)s -j y -o %(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.out -pe smp %(nprocesses)s runAll.sh %(job)s %(tag)s ' % (
                job_options) + args.task + ' ' + job_options[
                    'nprocesses'] + ' ' + job_options['job_id'] + ' ' + (
                        '0' if not job_options['bdt_params'] else
                        job_options['bdt_params'])
        command = command + ' mergeall'
        print 'the command is ', command
        dump_config(
            configs,
            '%(logpath)s/%(timestamp)s_%(job)s_%(tag)s_%(task)s.config' %
            (job_options))
        subprocess.check_call([command], shell=True)

    # RETRIEVE FILELIST FOR THE TREECOPIER PSI AND SINGLE FILE SYS STEPS
    def getfilelist(job):
        samplefiles = parser.get('Directories', 'samplefiles')
        filelist = myutils.filelist(samplefiles, job)
        return filelist
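    # getfilelist is used by the single-file workflows below to split a sample's
    # input files across jobs (the format is whatever myutils.filelist returns)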

    if args.task == 'train':
        train_list = parser.get('MVALists', 'List_for_submitscript').split(',')
        print train_list
        for item in train_list:
            submit(item, job_options)

    if args.task == 'dc':
        DC_vars = parser.get('LimitGeneral', 'List').split(',')
        print DC_vars

    if args.task == 'plot':
        Plot_vars = parser.get('Plot_general', 'List').split(',')

    if args.task != 'prep':
        path = parser.get('Directories', 'samplepath')
        info = myutils.ParseInfo(samplesinfo, path)

    if args.task == 'plot':
        job_options['queue'] = 'all.q'
        for item in Plot_vars:
            submit(item, job_options)

    if args.task == 'trainReg':
        job_options['queue'] = 'all.q'
        submit('trainReg', job_options)

    elif args.task == 'dc':
        job_options['queue'] = 'all.q'
        for item in DC_vars:
            # item here contains the dc name
            submit(item, job_options)

    elif args.task == 'prep':
        if not args.samples:
            path = parser.get('Directories', 'PREPin')
            info = myutils.ParseInfo(samplesinfo, path)
            for job in info:
                submit(job.name, job_options)
        else:
            for sample in args.samples:
                submit(sample, job_options)

    elif args.task in [
            'singleprep', 'singlesys', 'singleeval', 'mergesingleprep',
            'mergesinglesys', 'mergesingleeval'
    ]:
        if not args.samples:
            if args.task in ('singleprep', 'mergesingleprep'):
                path = parser.get('Directories', 'PREPin')
            elif args.task in ('singlesys', 'mergesinglesys'):
                path = parser.get('Directories', 'SYSin')
            elif args.task in ('singleeval', 'mergesingleeval'):
                path = parser.get('Directories', 'MVAin')
            info = myutils.ParseInfo(samplesinfo, path)
            sample_list = []
            for job in info:
                sample_list.append(job.identifier)
            sample_list = set(sample_list)
        else:
            sample_list = set(args.samples)

        for sample in sample_list:
            if sample == '':
                continue
            if args.task in ('singleprep', 'singlesys', 'singleeval'):
                files = getfilelist(sample)
                files_per_job = args.nsplit if args.nsplit > 0 else parser.getint(
                    'Configuration', 'files_per_job')
                files_split = [
                    files[x:x + files_per_job]
                    for x in xrange(0, len(files), files_per_job)
                ]
                files_split = [';'.join(sublist) for sublist in files_split]
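                # e.g. 7 input files with files_per_job = 3 yield three jobs:
                # 'f0;f1;f2', 'f3;f4;f5' and 'f6' (hypothetical file names)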
                counter_local = 0
                for files_sublist in files_split:
                    submitsinglefile(sample, job_options, files_sublist,
                                     run_locally, counter_local)
                    counter_local = counter_local + 1
            elif args.task in ('mergesingleprep', 'mergesinglesys', 'mergesingleeval'):
                mergesubmitsinglefile(sample, job_options, run_locally)

    # ADD SYSTEMATIC UNCERTAINTIES AND ADDITIONAL HIGHER LEVEL VARIABLES TO THE TREES
    elif args.task == 'sys' or args.task == 'syseval':
        path = parser.get('Directories', 'SYSin')
        info = myutils.ParseInfo(samplesinfo, path)
        if not args.samples:
            for job in info:
                if job.subsample:
                    continue  # avoid multiple submissions from subsamples
                # TO FIX FOR SPLIT SAMPLES
                submit(job.name, job_options)
        else:
            for sample in args.samples:
                submit(sample, job_options)

    # EVALUATION OF EVENT BY EVENT BDT SCORE
    elif args.task == 'eval':
        job_options['queue'] = 'long.q'
        path = parser.get('Directories', 'MVAin')
        info = myutils.ParseInfo(samplesinfo, path)
        if not args.samples:
            for job in info:
                if job.subsample:
                    continue  # avoid multiple submissions from subsamples
                if info.checkSplittedSampleName(
                        job.identifier
                ):  # if there are multiple entries for one name (split samples), use the identifier to submit
                    print 'INFO: Split samples: submitting through the identifier'
                    submit(job.identifier, job_options)
                else:
                    submit(job.name, job_options)
        else:
            for sample in args.samples:
                print sample
                submit(sample, job_options)

    # POSSIBILITY TO SPLIT SINGLE MERGED FILES INTO SUBFILES
    # IN PRINCIPLE USEFUL, BUT NOT USED ANYMORE AS THE LOGIC CHANGED (I.E. FILES ARE NO LONGER MERGED)
    elif args.task == 'split':
        path = parser.get('Directories', 'SPLITin')
        job_options['job_id'] = args.nsplit if args.nsplit > 0 else 100000
        info = myutils.ParseInfo(samplesinfo, path)
        if not args.samples:
            for job in info:
                if job.subsample:
                    continue  # avoid multiple submissions from subsamples
                submit(job.name, job_options)
        else:
            for sample in args.samples:
                submit(sample, job_options)

    # BDT optimisation
    elif args.task == 'mva_opt':
        total_number_of_steps = 1
        setting = ''
        for par in parser.get('Optimisation', 'parameters').split(','):
            # scan_par is a list like [start, end, n_steps] (inferred from the value formula below)
            scan_par = eval(parser.get('Optimisation', par))
            setting += par + '=' + str(scan_par[0]) + ':'
            if len(scan_par) > 1 and scan_par[2] != 0:
                total_number_of_steps += scan_par[2]
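        # e.g. parameters = NTrees,Shrinkage with NTrees = [200, 400, 2] and
        # Shrinkage = [0.1, 0.1, 0] (hypothetical) gives
        # setting = 'NTrees=200:Shrinkage=0.1:' and two extra scan steps for NTrees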
        #setting=setting[:-1] # eliminate the last colon at the end of the setting string
        #job_options['bdt_params']=setting
        job_options['bdt_params'] = 'main_par'
        job_options['job_id'] = parser.get('Optimisation', 'training')
        submit('OPT_main_set', job_options, False)
        main_setting = setting
        # Scanning all the parameters found in the training config in the Optimisation sector
        for par in parser.get('Optimisation', 'parameters').split(','):
            scan_par = eval(parser.get('Optimisation', par))
            if len(scan_par) > 1 and scan_par[2] != 0:
                for step in xrange(scan_par[2]):
                    value = scan_par[0] + (1 + step) * (
                        scan_par[1] - scan_par[0]) / scan_par[2]
                    setting = re.sub(par + '.*?:',
                                     par + '=' + str(value) + ':',
                                     main_setting)
                    job_options['bdt_params'] = setting
                    submit('OPT_' + par + str(value), job_options, False)
                    # submit(parser.get('Optimisation','training'),job_options)

    elif args.task == 'mva_opt_eval':
        #
        # This step evaluates the BDT produced by mva_opt.
        #

        #Read the config
        job_options['queue'] = 'long.q'
        path = parser.get('Directories', 'MVAin')
        job_options['job_id'] = parser.get('Optimisation', 'training')
        factoryname = parser.get('factory', 'factoryname')
        MVAdir = parser.get('Directories', 'vhbbpath') + '/python/weights/'
        #Read weights from the optimisation config, store them in a list (copied from mva_opt)
        total_number_of_steps = 1
        setting = ''
        for par in parser.get('Optimisation', 'parameters').split(','):
            scan_par = eval(parser.get('Optimisation', par))
            setting += par + '=' + str(scan_par[0]) + ':'
            if len(scan_par) > 1 and scan_par[2] != 0:
                total_number_of_steps += scan_par[2]
        job_options['bdt_params'] = setting
        job_options['job_id'] = parser.get('Optimisation', 'training')
        main_setting = setting
        config_weights_list = ['OPT_main_set']
        for par in parser.get('Optimisation', 'parameters').split(','):
            scan_par = eval(parser.get('Optimisation', par))
            if len(scan_par) > 1 and scan_par[2] != 0:
                for step in xrange(scan_par[2]):
                    value = scan_par[0] + (1 + step) * (
                        scan_par[1] - scan_par[0]) / scan_par[2]
                    setting = re.sub(par + '.*?:',
                                     par + '=' + str(value) + ':',
                                     main_setting)
                    config_weights_list.append('OPT_' + par + str(value))
        #List all the weights produced by the optimisation, read from the weights directory
        weights = ''
        for cw in config_weights_list:
            for w in os.listdir(MVAdir):
                w = w.replace(factoryname + '_', '')
                w = w.replace('.root', '')
                if not w == cw:
                    continue
                weights += w + ','
        weights = weights.rstrip(',')  # remove the trailing comma (also safe if weights is empty)
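        # e.g. weights = 'OPT_main_set,OPT_NTrees300.0' (hypothetical weight names)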
        #submit the jobs
        info = myutils.ParseInfo(samplesinfo, path)
        job_options['bdt_params'] = weights
        print 'The optimisation weights are', weights
        if not args.samples:
            for job in info:
                if job.subsample:
                    continue  # avoid multiple submissions from subsamples
                if info.checkSplittedSampleName(
                        job.identifier
                ):  # if there are multiple entries for one name (split samples), use the identifier to submit
                    print '@INFO: Split samples: submitting through the identifier'
                    submit(job.identifier, job_options)
                else:
                    submit(job.name, job_options)
        else:
            for sample in args.samples:
                print sample
                submit(sample, job_options)

    #Work in progress...
    elif args.task == 'mva_opt_dc':
        total_number_of_steps = 1
        setting = ''
        for par in parser.get('Optimisation', 'parameters').split(','):
            scan_par = eval(parser.get('Optimisation', par))
            setting += par + '=' + str(scan_par[0]) + ':'
            if len(scan_par) > 1 and scan_par[2] != 0:
                total_number_of_steps += scan_par[2]
        print setting
        job_options['bdt_params'] = 'OPT_main_set'
        dc = parser.get('Optimisation', 'dc')
        #Still need to launch main
        submit(dc, job_options, False)
        main_setting = setting
        # Scanning all the parameters found in the training config in the Optimisation sector
        for par in parser.get('Optimisation', 'parameters').split(','):
            scan_par = eval(parser.get('Optimisation', par))
            print par
            if len(scan_par) > 1 and scan_par[2] != 0:
                for step in xrange(scan_par[2]):
                    value = scan_par[0] + (1 + step) * (
                        scan_par[1] - scan_par[0]) / scan_par[2]
                    print value
                    job_options['bdt_params'] = 'OPT_' + par + str(value)
                    submit(dc, job_options, False)
                    print setting

    if args.philipp_love_progress_bars:
        os.system('qstat.py')
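
The snippet above calls parse_command_line without showing it. Below is a
minimal sketch of what it might look like, assuming argparse; the flag
spellings and defaults are guesses reconstructed from the attributes main()
accesses (tag, task, samples, nsplit, local, batch, tagdir, verbose,
philipp_love_progress_bars):

import argparse

def parse_command_line(argv=None):
    # hypothetical reconstruction; only the attribute names are taken from main()
    parser = argparse.ArgumentParser(description='Submit Xbb analysis jobs')
    parser.add_argument('--tag', required=True, help='config tag, e.g. Zll2016')
    parser.add_argument('--task', required=True, help='prep, sys, eval, dc, plot, ...')
    parser.add_argument('--samples', nargs='*', default=[],
                        help='restrict submission to these samples')
    parser.add_argument('--nsplit', type=int, default=-1,
                        help='files per job for the single-file workflows')
    parser.add_argument('--local', action='store_true', help='force local running')
    parser.add_argument('--batch', action='store_true', help='force batch submission')
    parser.add_argument('--tagdir', default='', help='subdirectory for logs/plots/limits')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--philipp-love-progress-bars', action='store_true',
                        help='run qstat.py after submission')
    return parser.parse_args(argv)

if __name__ == '__main__':
    main()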