def build_rds(working_dir, reduced_dir, reduced_dataset, input_files,
              jet_collection, check_file, do_test,
              observer_discriminators = _default_observers):
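    """
    Build the reduced dataset and a profile file under reduced_dir,
    skipping any output that already exists, then touch check_file to
    mark the step as done.
    """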
    if not os.path.isdir(working_dir): 
        os.mkdir(working_dir)

    # --- rds part
    if not os.path.isdir(reduced_dir): 
        os.mkdir(reduced_dir)

    if not os.path.isfile(reduced_dataset): 
        double_variables, int_variables = rds.get_allowed_rds_variables(
            input_files = input_files, 
            jet_collection = jet_collection)


        pyprep.prep_ntuple(input_files = input_files, 
                           double_variables = double_variables, 
                           int_variables = int_variables, 
                           observer_discriminators = observer_discriminators, 
                           jet_collection = jet_collection, 
                           output_file = reduced_dataset, 
                           debug = do_test)
        
    profile_file = os.path.join(reduced_dir, 'profiled.root')
    if not os.path.isfile(profile_file): 
        profile.make_profile_file(reduced_dataset, profile_file)
    
    # mark as done
    open(check_file,'w').close()

def train_and_test(input_files,
                   config_file,
                   working_dir = None,
                   do_test = False,
                   ):
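    """
    Run preprocessing, training, and testing as driven by an ini-style
    config file. A sketch of the expected layout (the option names are
    the ones read below; the values are only examples):

        [preprocessing]
        jet_collection = AntiKt4TopoEMJets
        jet_tagger = myTaggerARRAYID
        pt_divisions = 20.0 50.0 100.0 200.0
        observer_discriminators = ip3d sv1

        [training]
        variables = JetPt JetEta
        bottom_wt = 1.0
        charm_wt = 1.0
        light_wt = 1.0

        [testing]
        testing_dataset = testing.root

    An 'ARRAYID' token in jet_tagger is replaced with PBS_ARRAYID and
    moves the working directory (for array jobs). testing_dataset is
    optional and defaults to the reduced dataset itself.
    """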


    config = SafeConfigParser()
    config.read(config_file)

    # --- setup preprocessing
    preproc = dict(config.items('preprocessing'))
    jet_collection = preproc['jet_collection']


    pt_divisions = [float(x) for x in preproc['pt_divisions'].split() ]
    observer_discriminators = preproc['observer_discriminators'].split()

    # --- load training and testing options early
    training_opts = dict(config.items('training'))
    testing_opts = dict(config.items('testing'))
    training_variables = training_opts['variables'].split()

    testing_dataset = None

    if 'testing_dataset' in testing_opts: 
        testing_dataset = testing_opts['testing_dataset']

    # --- change some things if this is an array job
    jet_tagger = preproc['jet_tagger']
    if 'ARRAYID' in jet_tagger: 
        the_array_id = os.environ['PBS_ARRAYID'].rjust(2,'0')
        jet_tagger = jet_tagger.replace('ARRAYID',the_array_id)
        working_dir = jet_tagger
        if testing_dataset: 
            testing_dataset = os.path.join(working_dir,testing_dataset)

    if testing_dataset and not os.path.isfile(testing_dataset): 
        raise IOError('{} not found'.format(testing_dataset))

    if config.has_section('weights'):
        warn('moving [weights] contents into [training] section',
             FutureWarning)
        for wt_name, wt in config.items('weights'):
            config.set('training', wt_name + '_wt', wt)
        config.remove_section('weights')
        with open(config_file,'w') as new_cfg:
            config.write(new_cfg)

    flavors = ['bottom','charm','light']
    flavor_weights = {
        f: config.getfloat('training', f + '_wt') for f in flavors}

    # --- setup the working directory 
    if not working_dir: 
        working_dir = jet_collection
    if not os.path.isdir(working_dir): 
        os.mkdir(working_dir)

    # --- hold here if someone else is working 
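    # (hold_job is assumed to block until no other job holds this
    # directory; set_hold then claims it, released further down)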
    hold_job(working_dir)
    set_hold(working_dir)

    # --- rds part
    rds_name = 'reduced_dataset.root'
    # the weight file also lives in this directory
    rds_dir = os.path.join(working_dir, 'reduced')
    if not os.path.isdir(rds_dir): 
        os.mkdir(rds_dir)

    rds_path = os.path.join(rds_dir, rds_name )
    if not testing_dataset: 
        testing_dataset = rds_path

    weight_file = os.path.join(rds_dir, 'weights.root')
    if not os.path.isfile(weight_file): 
        
        # build a light ntuple if one doesn't exist
        if os.path.isfile(rds_path): 
            small_rds_path = rds_path 

        else: 
            print '--- making flat ntuple to build weight file ---'
            small_rds = 'small_rds.root'
            small_rds_path = os.path.join(rds_dir,small_rds)
            if not os.path.isfile(small_rds_path): 
                pyprep.make_flat_ntuple(
                    input_files = input_files, 
                    jet_collection = jet_collection, 
                    jet_tagger = jet_tagger, 
                    output_file = small_rds_path)
            
        # log-spaced pt bins, 15 to 300 GeV (11 edges, 10 bins)
        pt_low, pt_high = (15.0, 300.0)
        log_span = log(pt_high) - log(pt_low)
        log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)]
        pt_bins = [exp(x) for x in log_range]
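        # (equivalently, assuming numpy is available:
        #  pt_bins = list(np.logspace(np.log10(pt_low), np.log10(pt_high), 11)))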

        print '--- making weight file ---'
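        # the weight file holds one 2D (pt, eta) profile per flavor;
        # pyprep is assumed to use it to reweight jets in the skim below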
        from jetnet import cxxprofile
        cxxprofile.pro2d(
            in_file = small_rds_path, 
            tree = 'SVTree', 
            plots = [( ('JetPt', pt_bins),
                       ('JetEta',10,-2.5,2.5) )], 
            tags = ['bottom','charm','light'], 
            out_file = weight_file, 
            show_progress = True)


    double_variables, int_variables = rds.get_allowed_rds_variables(
        input_files = input_files, 
        full_dir_name = jet_collection + '_' + jet_tagger)


    if not os.path.isfile(rds_path): 
        print '--- making flattened dataset for training ---'
        flags = 'hr' if not do_test else 'd'
        pyprep.make_flat_ntuple(
            input_files = input_files, 
            weight_file = weight_file, 
            double_variables = double_variables, 
            int_variables = int_variables, 
            observer_discriminators = observer_discriminators, 
            pt_divisions = pt_divisions, 
            jet_collection = jet_collection, 
            jet_tagger = jet_tagger, 
            output_file = rds_path, 
            flags = flags, 
            )

    # --- unset other job hold 
    set_hold(working_dir, value = False)

    proc = process.RDSProcess(
        reduced_dataset = rds_path, 
        working_dir = working_dir, 
        training_variables = training_variables, 
        flavor_weights = flavor_weights, 
        testing_dataset = testing_dataset, 
        do_test = do_test, 
        config_file = config_file)
    proc.start()
    proc.join()
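    # RDSProcess is assumed to put a dict of output paths on out_queue
    # before exiting; get(block = False) raises Queue.Empty otherwise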
    proc_outputs = proc.out_queue.get(block = False)


    # --- make the summary folder
    working_dir_parent = os.path.dirname(working_dir)
    if not working_dir_parent:
        summary_dir = 'summary'
    else:
        summary_dir = os.path.join(working_dir_parent,'summary')

    if not os.path.isdir(summary_dir): 
        os.mkdir(summary_dir)

    # use only the base name so the copies land inside summary_dir
    summary_base_name, cfg_ext = os.path.splitext(os.path.basename(config_file))
    if 'PBS_ARRAYID' in os.environ: 
        summary_base_name += '_subjob{}'.format(os.environ['PBS_ARRAYID'])

    if 'profile' in proc_outputs: 
        profile_summary_name = summary_base_name + '_profile.root'
        profile_summary_path = os.path.join(summary_dir,profile_summary_name)
        shutil.copyfile(proc_outputs['profile'], profile_summary_path)


    this_config_name = summary_base_name + cfg_ext
    this_config_path = os.path.join(summary_dir, this_config_name)
    shutil.copyfile(config_file, this_config_path)

def make_flat_ntuple(
    input_files,
    pt_divisions,
    weight_file = '',
    jet_collection = 'BTag_AntiKt4TopoEMJetsReTagged',
    jet_tagger = 'JetFitterCharm',
    rds_path = 'reduced_dataset.root',
    observer_discriminators = _default_observers,
    do_test = False,
    skim_function = pyprep.make_flat_ntuple,
    ):
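    """
    Skim input_files into a flat ntuple at rds_path, refusing to
    overwrite an existing file. If weight_file is named but doesn't
    exist yet, a small intermediate ntuple is skimmed first and turned
    into a 2D (pt, eta) weight file per flavor.
    """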

    double_variables, int_variables = rds.get_allowed_rds_variables(
        input_files = input_files, 
        full_dir_name = '_'.join([jet_collection,jet_tagger]))

    # --- make weights if a name is given 
    if weight_file and not os.path.isfile(weight_file): 
        
        # build a light ntuple if one doesn't exist
        if os.path.isfile(rds_path): 
            small_rds_path = rds_path 

        else: 
            print '--- making flat ntuple to build weight file ---'

            rds_dir, rds_name = os.path.split(rds_path)
            small_rds = '.'.join(rds_name.split('.')[:-1]) + '_small.root'
            small_rds_path = os.path.join(rds_dir,small_rds)
            if not os.path.isfile(small_rds_path): 
                pyprep.make_flat_ntuple(
                    input_files = input_files, 
                    jet_collection = jet_collection, 
                    jet_tagger = jet_tagger, 
                    output_file = small_rds_path)
            
        # log-spaced pt bins, 15 to 250 GeV (11 edges, 10 bins)
        pt_low, pt_high = (15.0, 250.0)
        log_span = log(pt_high) - log(pt_low)
        log_range = [log(pt_low) + i * log_span / 10 for i in xrange(11)]
        pt_bins = [exp(x) for x in log_range]

        from jetnet import cxxprofile
        cxxprofile.pro2d(
            in_file = small_rds_path,
            tree = 'SVTree',
            plots = [( ('JetPt', pt_bins),
                       ('JetEta',10,-2.5,2.5) )],
            tags = ['bottom','charm','light'],
            out_file = weight_file,
            show_progress = True)

    # --- rds part

    rds_dir, rds_file = os.path.split(rds_path)
    if rds_dir and not os.path.isdir(rds_dir): 
        os.mkdir(rds_dir)

    if os.path.isfile(rds_path):
        raise IOError(
            "{} already exists, refusing to overwrite".format(rds_path))

    skim_function(
        input_files = input_files,
        weight_file = weight_file,
        double_variables = double_variables,
        int_variables = int_variables,
        observer_discriminators = observer_discriminators,
        pt_divisions = pt_divisions,
        jet_collection = jet_collection,
        jet_tagger = jet_tagger,
        output_file = rds_path,
        debug = do_test,
        )

def run_full_chain_by_pt(
    input_files,
    working_dir = None,
    rds_dir = 'reduced_pt',
    jet_collection = 'AntiKt4TopoEMJets',
    do_test = False,
    training_variables = training_variable_whitelist,
    pt_divisions = default_pt_divisions,
    observer_discriminators = _default_observers,
    flavor_weights = {},
    cram = False,
    sequential = False):
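    """
    Build one reduced dataset per pt category, then run an RDSProcess
    on each in its own working subdirectory. With sequential the
    subjobs run one at a time; cram lets them all run even when there
    are fewer processors than subjobs. Returns 0 on completion.
    """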

    
    if working_dir is None: 
        working_dir = jet_collection

    if not os.path.isdir(working_dir): 
        os.mkdir(working_dir)

    # --- rds part
    reduced_dir = os.path.join(working_dir, rds_dir)
    if not os.path.isdir(reduced_dir): 
        os.mkdir(reduced_dir)

    reduced_datasets = glob.glob('%s/reduced_*' % reduced_dir)
    if len(reduced_datasets) == 0: 
        double_variables, int_variables = rds.get_allowed_rds_variables(
            input_files = input_files, jet_collection = jet_collection)

        pyprep.make_ntuples_ptcat(
            input_files = input_files, 
            double_variables = double_variables, 
            int_variables = int_variables, 
            observer_discriminators = observer_discriminators, 
            pt_divisions = [float(pt) for pt in pt_divisions], 
            jet_collection = jet_collection, 
            output_dir = reduced_dir, 
            debug = do_test )
    
    reduced_datasets = glob.glob('%s/reduced_*' % reduced_dir)

    n_processors = multiprocessing.cpu_count()
    # -- allow one fewer cpu than subjobs:
    #    the low-pt bin doesn't run anyway
    if n_processors < len(reduced_datasets) - 1:
        # parenthesize so the implicitly-joined string is actually printed
        print ('WARNING: not enough processors for these subjobs, '
               'want %i, found %i' % (len(reduced_datasets), n_processors))
        if not cram and not sequential:
            sys.exit('quitting...')

    subprocesses = []
    for ds in reduced_datasets: 
        # str.rstrip / lstrip strip character sets, not suffixes, so
        # peel off '.root' and 'reduced_' with splitext and slicing
        rds_basename = os.path.splitext(os.path.basename(ds))[0]
        category = rds_basename[len('reduced_'):]
        working_subdir = os.path.join(working_dir,'pt_' + category)
        if not os.path.isdir(working_subdir): 
            os.mkdir(working_subdir)

        proc = process.RDSProcess(
            reduced_dataset = ds, 
            working_dir = working_subdir, 
            training_variables = training_variables, 
            flavor_weights = flavor_weights, 
            do_test = do_test)
        proc.start()
        subprocesses.append(proc)
        if sequential: 
            proc.join()

    for proc in subprocesses: 
        proc.join()

    return 0
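
# a minimal driver sketch (the file and config names are hypothetical):
#
#   from glob import glob
#   train_and_test(input_files = glob('ntuples/*.root'),
#                  config_file = 'training.cfg')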