Example 1
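# NOTE: run() is shown out of its class context; it assumes module-level
#   import os, re, itertools
#   from warnings import warn
# and local modules profile, training, and utils. Judging from the script
# at the end of this listing these likely come from the jetnet package,
# but that is an assumption.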
    def run(self): 

        reduced_dataset = self._reduced_dataset
        working_dir = self._working_dir
        training_variables = self._training_variables
        do_test = self._do_test

        # --- profiling 
        profile_dir = os.path.join(working_dir,'profile')
        if not os.path.isdir(profile_dir): 
            os.mkdir(profile_dir)

        profile_file = os.path.join(profile_dir, 'profiled.root')
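        # fall back to a profiled.root sitting next to the reduced dataset
        # when there is no profile file in the working directory yet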
        if not os.path.isfile(profile_file):
            rds_dir = os.path.split(reduced_dataset)[0]
            alt_profile_file = os.path.join(rds_dir, 'profiled.root')
            if os.path.isfile(alt_profile_file): 
                profile_file = alt_profile_file

        mean_rms_file = os.path.join(profile_dir, 'mean_rms.txt')
        if not do_test: 
            if not os.path.isfile(mean_rms_file): 
                if not os.path.isfile(profile_file): 
                    print '--- making profile file for normalization ---'
                    profile.make_profile_file(reduced_dataset, profile_file)
            
                profile.build_mean_rms_from_profile(
                    profile_file = profile_file, 
                    text_file_name = mean_rms_file)

        # --- training part 
        training_dir = os.path.join(working_dir,self._training_subdir)
        if not os.path.isdir(training_dir): 
            os.mkdir(training_dir)
    
        normalization_file = os.path.join(training_dir, 'normalization.txt')
        if not os.path.isfile(normalization_file) and not do_test: 
            if not os.path.isfile(profile_file): 
                profile.make_profile_file(reduced_dataset, profile_file)
    
            profile.make_normalization_file(
                profile_file, 
                normalization_file = normalization_file, 
                whitelist = training_variables)
                                            
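        # the normalization file is parsed below as one whitespace-separated
        # record per variable: <name> <offset> <scale>
        # e.g. "JetPt -50.0 0.01"   (values hypothetical)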
        normalization_dict = {}
        if os.path.isfile(normalization_file): 
            with open(normalization_file) as norm_file: 
                for line in norm_file: 
                    fields = line.split()
                    if not fields: continue
                    name = fields[0]
                    offset, scale = (float(n) for n in fields[1:])
                    normalization_dict[name] = (offset, scale)
    
        if normalization_dict: 
            print 'normalization:'
            text_size = max(len(x) for x in normalization_dict) + 1
            for name, (offset, scale) in normalization_dict.iteritems(): 
                print '%-*s offset: % -10.4g scale: % -10.4g' % (
                    text_size, name, offset, scale)
    
        weights_path = os.path.join(training_dir, 'weightMinimum.root')
        if not os.path.isfile(weights_path): 
            print '--- running training ---'
            training.run_training(reduced_dataset = reduced_dataset, 
                                  output_directory = training_dir, 
                                  normalization = normalization_dict, 
                                  flavor_weights = self._flavor_weights, 
                                  nodes = self._nodes, 
                                  debug = do_test, 
                                  events = self._n_training_events, 
                                  other_opt_dict = self._other_opt_dict)
    
        # --- diagnostics part 
        testing_dir = os.path.join(working_dir, self._testing_subdir)
        if not os.path.isdir(testing_dir): 
            os.mkdir(testing_dir)
    

        if not self._testing_ds: 
            self._testing_ds = reduced_dataset

        augmented_tree = os.path.join(testing_dir, 'perf_ntuple.root') 
        if not os.path.isfile(augmented_tree): 
            print '--- augmenting reduced dataset with nn classifiers ---'
            from ROOT import TFile
            testing_ds_file = TFile(self._testing_ds)

            the_tree = testing_ds_file.Get('SVTree')
            all_vars_in_tree = utils.get_leaves_in_tree(the_tree)
            # only the leaf names are needed later, so close the file now
            testing_ds_file.Close()

            # --- filter out branches we don't care about
            output_subset = ['bottom','light','charm','JetPt','JetEta']
            subset_regex = '|'.join([
                    '^discriminator(?!Jet)', 
                    '^log[BCU][bcu]', 
                ])
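            # (?!Jet) is a negative lookahead: this keeps branches that
            # start with 'discriminator' unless 'Jet' follows immediately;
            # the second pattern picks up log-ratio branches like logBc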
            branch_filter = re.compile(subset_regex)
            for branch in itertools.chain(*all_vars_in_tree.values()): 
                if branch_filter.search(branch): 
                    output_subset.append(branch)

            from jetnet import pynn
            pynn.augment_tree(
                in_file = self._testing_ds, 
                nn_file = weights_path, 
                out_file = augmented_tree, 
                ints = all_vars_in_tree['Int_t'], 
                doubles = all_vars_in_tree['Double_t'], 
                extension = self._augment_extension, 
                subset = output_subset, 
                show_progress = True) 

        profiled_path = os.path.splitext(augmented_tree)[0] + '_profile.root'
        
        if not os.path.isfile(profiled_path): 
            print '--- profiling performance ntuple ---'
            profile.make_profile_file(reduced_dataset = augmented_tree, 
                                      profile_file = profiled_path)

        output_paths = {
            'profile': profiled_path, 
            'perf_ntuple': augmented_tree, 
            }


        if self._do_more_diagnostics is not None: 
            warn("do_more_diagnostics doesn't do anything now, please remove", 
                 DeprecationWarning, 
                 stacklevel = 5, 
                 # stacklevel 5 needed to get through the multiprocess call
                 ) 
                 

        self.out_queue.put(output_paths)
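The out_queue attribute and the stacklevel comment above suggest that run()
lives on a multiprocessing.Process subclass. Below is a minimal sketch of how
such a driver might look; the class name, constructor signature, and file
paths are assumptions for illustration, not taken from the source.

import multiprocessing

class TrainingProcess(multiprocessing.Process):   # hypothetical name
    def __init__(self, reduced_dataset, working_dir, out_queue): 
        super(TrainingProcess, self).__init__()
        self._reduced_dataset = reduced_dataset
        self._working_dir = working_dir
        self.out_queue = out_queue
    def run(self): 
        # placeholder body: the real run() is the method shown above
        self.out_queue.put({'profile': None, 'perf_ntuple': None})

if __name__ == '__main__': 
    queue = multiprocessing.Queue()
    proc = TrainingProcess('reduced.root', 'work', queue)   # paths made up
    proc.start()
    output_paths = queue.get()   # blocks until run() publishes its paths
    proc.join()
    print output_paths

The standalone script below runs the same pynn.augment_tree step outside
the pipeline: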
from jetnet import pynn, utils
import os, sys

from ROOT import TFile

if __name__ == '__main__': 
    if not ( 3 <= len(sys.argv) <= 4 ): 
        sys.exit('usage: %s <in file> <nn file> [<output file>]' % sys.argv[0])

    input_file = sys.argv[1]
    nn_file = sys.argv[2]
    try: 
        output_file = sys.argv[3]
    except IndexError: 
        output_file = os.path.splitext(input_file)[0] + '_aug.root'
        print 'making', output_file
    tree = 'SVTree'

    the_file = TFile(input_file)
    the_tree = the_file.Get(tree)
    all_vars_in_tree = utils.get_leaves_in_tree(the_tree)
    
    pynn.augment_tree(
        in_file = input_file, 
        nn_file = nn_file, 
        out_file = output_file, 
        ints = all_vars_in_tree['Int_t'], 
        doubles = all_vars_in_tree['Double_t'], 
        )
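
# typical invocation of the script above (file names hypothetical):
#   python augment.py reduced.root weightMinimum.root
# with no third argument the output defaults to reduced_aug.root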