from DeepJetCore.dataPipeline import TrainDataGenerator

infile = args.infile
nbatch = int(args.nelementsperfile)
randomise = args.randomise

dc = DataCollection(infile)
dc2 = DataCollection(infile)
samples = dc.samples
dir = dc.dataDir
if len(dir) < 1:
    dir = '.'
insamples = [dir + '/' + s for s in samples]

gen = TrainDataGenerator()
gen.setBatchSize(nbatch)
gen.setSkipTooLargeBatches(False)
gen.setFileList(insamples)
if randomise:
    gen.shuffleFileList()

nbatches = gen.getNBatches()

newsamples = []
for i in range(nbatches):
    newname = str(samples[0][:-6] + "_n_" + str(i) + ".djctd")
    newsamples.append(newname)
    ntd = gen.getBatch()
    print(newname)
use_inputdir = ""
outfilename = "pred_" + os.path.basename(inputfile)

td = dc.dataclass()

if inputfile[-5:] == 'djctd':
    if args.unbuffered:
        td.readFromFile(use_inputdir + "/" + inputfile)
    else:
        td.readFromFileBuffered(use_inputdir + "/" + inputfile)
else:
    print('converting ' + inputfile)
    td.readFromSourceFile(use_inputdir + "/" + inputfile,
                          dc.weighterobjects,
                          istraining=False)

gen = TrainDataGenerator()
if batchsize < 1:
    batchsize = dc.getBatchSize()
print('batch size', batchsize)
gen.setBatchSize(batchsize)
gen.setSquaredElementsLimit(dc.batch_uses_sum_of_squares)
gen.setSkipTooLargeBatches(False)
gen.setBuffer(td)

predicted = model.predict_generator(gen.feedNumpyData(),
                                    steps=gen.getNBatches(),
                                    max_queue_size=1,
                                    use_multiprocessing=False,
                                    verbose=1)

x = td.transferFeatureListToNumpy(args.pad_rowsplits)
def invokeGen(infile):
    if infile[-6:] == '.djcdc':
        dc = DataCollection(infile)
        td = dc.dataclass()
        tdclass = dc.dataclass
        dc.setBatchSize(1)
        gen = dc.invokeGenerator()
    elif infile[-6:] == '.djctd':
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromFile(infile)
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)
    elif infile[-5:] == '.root':
        print('reading from root file')
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromSourceFile(infile, {}, True)
        td.writeToFile(infile + '.djctd')
        td.readFromFile(infile + '.djctd')
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)

    gen.setSkipTooLargeBatches(False)
    nevents = gen.getNBatches()
    gen.cast_to = tdclass
    return gen.feedTrainData, nevents, td
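# Hedged usage sketch (not from the original sources): how the triple returned by
# invokeGen above might be consumed. The file name 'events.djctd' and the loop are
# placeholders for illustration; feedTrainData is assumed to behave like feedNumpyData
# in the other snippets, i.e. calling it returns an iterator over batches (of size 1 here).
gen_func, nevents, td = invokeGen('events.djctd')  # placeholder input file
feeder = gen_func()
for _ in range(nevents):
    event_td = next(feeder)  # one event per iteration, since the batch size is set to 1
    # ... inspect or process the event here ...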
class PredictCallback(Callback):

    def __init__(self,
                 samplefile,
                 function_to_apply=None,  # needs to be function(counter, [model_input], [predict_output], [truth])
                 after_n_batches=50,
                 batchsize=10,
                 on_epoch_end=False,
                 use_event=0,
                 decay_function=None,
                 offset=0):
        super(PredictCallback, self).__init__()
        self.samplefile = samplefile
        self.function_to_apply = function_to_apply
        self.counter = 0
        self.call_counter = offset
        self.decay_function = decay_function

        self.after_n_batches = after_n_batches
        self.run_on_epoch_end = on_epoch_end

        if self.run_on_epoch_end and self.after_n_batches >= 0:
            print('PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end')
            self.after_n_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        if use_event >= 0:
            td.skim(use_event)

        self.batchsize = 1
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(batchsize)
        self.gen.setSkipTooLargeBatches(False)

    def reset(self):
        self.call_counter = 0

    def predict_and_call(self, counter):
        self.gen.setBuffer(self.td)

        predicted = self.model.predict_generator(self.gen.feedNumpyData(),
                                                 steps=self.gen.getNBatches(),
                                                 max_queue_size=1,
                                                 use_multiprocessing=False,
                                                 verbose=2)
        if not isinstance(predicted, list):
            predicted = [predicted]

        self.function_to_apply(self.call_counter,
                               self.td.copyFeatureListToNumpy(),
                               predicted,
                               self.td.copyTruthListToNumpy())
        self.call_counter += 1

    def on_epoch_end(self, epoch, logs=None):
        self.counter = 0
        if not self.run_on_epoch_end:
            return
        self.predict_and_call(epoch)

    def on_batch_end(self, batch, logs=None):
        if self.after_n_batches <= 0:
            return
        self.counter += 1
        if self.counter > self.after_n_batches:
            self.counter = 0
            self.predict_and_call(batch)
            if self.decay_function is not None:
                self.after_n_batches = self.decay_function(self.call_counter)
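# Hedged usage sketch (not from the original sources): how PredictCallback might be
# attached to a Keras training run. The sample file, the model, and write_out are
# placeholders; only the constructor arguments come from the class above, and the
# callbacks= argument is standard Keras.
def write_out(counter, features, predictions, truth):
    # user-defined hook: receives the numpy feature/truth lists and the model predictions
    print('prediction round', counter, 'with', len(predictions), 'output arrays')

pred_cb = PredictCallback('validation_sample.djctd',  # placeholder validation file
                          function_to_apply=write_out,
                          after_n_batches=100,
                          batchsize=10)
# model.fit(..., callbacks=[pred_cb])  # pass alongside any other callbacks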
def invokeGenerator(self):
    generator = TrainDataGenerator()
    generator.setBatchSize(self.__batchsize)
    generator.setSquaredElementsLimit(self.batch_uses_sum_of_squares)
    generator.setFileList([self.dataDir + "/" + s for s in self.samples])
    return generator
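# Hedged usage sketch (not from the original sources): the pattern around
# DataCollection.invokeGenerator as it appears in the other snippets here
# (setBatchSize before invoking, then getNBatches/feedNumpyData to iterate).
# The .djcdc file name is a placeholder.
dc = DataCollection('dataCollection.djcdc')  # placeholder data collection file
dc.setBatchSize(1)
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)

feeder = gen.feedNumpyData()
for _ in range(gen.getNBatches()):
    data_in = next(feeder)  # data_in[0] holds the feature list, as in the predict() snippet below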
from DeepJetCore.TrainData import TrainData
from DeepJetCore.dataPipeline import TrainDataGenerator
from LayersRagged import RaggedConstructTensor
import index_dicts
import tensorflow as tf
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

td = TrainData()
td.readFromFile('/eos/cms/store/cmst3/group/hgcal/CMG_studies/pepr/50_part_with_noise_Jul2020/converted/HGCalML_data/50_part_with_noise_Jul2020/988_windowntup.djctd')

gen = TrainDataGenerator()
gen.setBatchSize(100000)
gen.setSkipTooLargeBatches(False)
gen.setBuffer(td)

with tf.device('/CPU:0'):
    ragged_constructor = RaggedConstructTensor()

while True:
    feat, truth = next(gen.feedNumpyData())  # this is [ [features], [truth], [None] ]
    if gen.lastBatch():
        break
    row_splits = feat[1][:, 0]
def predict(self, model=None, model_path=None, output_to_file=True):
    if model_path is None:
        model_path = self.model_path

    if model is None:
        if not os.path.exists(model_path):
            raise FileNotFoundError('Model file not found')

    assert model_path is not None or model is not None

    outputs = []
    if output_to_file:
        os.system('mkdir -p ' + self.predict_dir)

    if model is None:
        model = load_model(model_path)

    all_data = []
    for inputfile in self.input_data_files:

        use_inputdir = self.inputdir
        if inputfile[0] == "/":
            use_inputdir = ""
        outfilename = "pred_" + os.path.basename(inputfile)

        print('predicting ', use_inputdir + '/' + inputfile)

        td = self.dc.dataclass()

        # also allows for inheriting classes now, like with tracks or special PU
        if not isinstance(td, TrainData_NanoML) and type(td) is not TrainData_TrackML:
            raise RuntimeError("TODO: make sure this works for other traindata formats")

        if inputfile[-5:] == 'djctd':
            if self.unbuffered:
                td.readFromFile(use_inputdir + "/" + inputfile)
            else:
                td.readFromFileBuffered(use_inputdir + "/" + inputfile)
        else:
            print('converting ' + inputfile)
            td.readFromSourceFile(use_inputdir + "/" + inputfile,
                                  self.dc.weighterobjects,
                                  istraining=False)

        gen = TrainDataGenerator()
        # the batch size must be one, otherwise we need to play tricks with the row splits later on
        gen.setBatchSize(1)
        gen.setSquaredElementsLimit(False)
        gen.setSkipTooLargeBatches(False)
        gen.setBuffer(td)

        num_steps = gen.getNBatches()
        generator = gen.feedNumpyData()

        dumping_data = []

        thistime = time.time()
        for _ in range(num_steps):
            data_in = next(generator)
            predictions_dict = model(data_in[0])
            for k in predictions_dict.keys():
                predictions_dict[k] = predictions_dict[k].numpy()
            features_dict = td.createFeatureDict(data_in[0])
            truth_dict = td.createTruthDict(data_in[0])

            dumping_data.append([features_dict, truth_dict, predictions_dict])

        totaltime = time.time() - thistime
        print('took approx', totaltime / num_steps, 's per endcap (also includes dict building)')

        td.clear()
        gen.clear()
        outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'
        if output_to_file:
            td.writeOutPredictionDict(dumping_data, self.predict_dir + "/" + outfilename)
        outputs.append(outfilename)
        if not output_to_file:
            all_data.append(dumping_data)

    if output_to_file:
        with open(self.predict_dir + "/outfiles.txt", "w") as f:
            for l in outputs:
                f.write(l + '\n')

    if not output_to_file:
        return all_data
class plotRunningPerformanceMetrics(Callback):

    def __init__(self,
                 samplefile,
                 accumulate_after_batches=5,
                 plot_after_batches=50,
                 batchsize=10,
                 beta_threshold=0.6,
                 distance_threshold=0.6,
                 iou_threshold=0.1,
                 n_windows_for_plots=5,
                 n_windows_for_scalar_metrics=5000000,
                 outputdir=None,
                 publish=None,
                 n_ccoords=None,
                 n_average_over_samples=5,
                 ):
        """
        :param samplefile: the file to pick validation data from
        :param accumulate_after_batches: run performance metrics after n batches (a good value is 5)
        :param plot_after_batches: update and upload plots after n batches
        :param batchsize: batch size
        :param beta_threshold: beta threshold for running prediction on obc
        :param distance_threshold: distance threshold for running prediction on obc
        :param iou_threshold: iou threshold used for matching, both for obc and for ticl
        :param n_windows_for_plots: how many windows to average over for the running performance plots
        :param n_windows_for_scalar_metrics: the maximum number of windows to store for scalar performance metrics as a function of iteration
        :param outputdir: the output directory where to store results
        :param publish: where to publish, can be an ssh'able path
        :param n_ccoords: number of cluster coordinates for plots
        :param n_average_over_samples: average scalar metrics over this many samples
        """
        super(plotRunningPerformanceMetrics, self).__init__()
        self.samplefile = samplefile
        self.counter = 0
        self.call_counter = 0
        self.decay_function = None
        self.outputdir = outputdir
        self.n_ccords = n_ccoords
        self.publish = publish
        self.accumulate_after_batches = accumulate_after_batches
        self.plot_after_batches = plot_after_batches
        self.run_on_epoch_end = False

        if self.run_on_epoch_end and self.accumulate_after_batches >= 0:
            print('PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end')
            self.accumulate_after_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        # td_selected = td.split(self.n_events)  # check if this works in ragged out of the box
        # if use_event >= 0:
        #     if use_event < td.nElements():
        #         td.skim(use_event)
        #     else:
        #         td.skim(use_event % td.nElements())

        self.batchsize = batchsize
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(self.batchsize)
        self.gen.setSkipTooLargeBatches(False)
        self.gen.setBuffer(td)
        self.n_batches = self.gen.getNBatches()

        with tf.device('/CPU:0'):
            self.ragged_constructor = RaggedConstructTensor()

        self.window_id = 0
        self.window_analysis_dicts = []
        self.n_windows_for_plots = n_windows_for_plots
        self.n_windows_for_scalar_metrics = n_windows_for_scalar_metrics

        self.beta_threshold = beta_threshold
        self.distance_threshold = distance_threshold
        self.iou_threshold = iou_threshold

        self.scalar_metrics = dict()
        self.scalar_metrics['efficiency'] = []
        self.scalar_metrics['efficiency_ticl'] = []
        self.scalar_metrics['fake_rate'] = []
        self.scalar_metrics['fake_rate_ticl'] = []
        self.scalar_metrics['var_response'] = []
        self.scalar_metrics['var_response_ticl'] = []
        self.scalar_metrics['iteration'] = []

        self.n_average_over_samples = n_average_over_samples

        self.plot_process = None

    def reset(self):
        self.call_counter = 0

    def predict_and_call(self, counter):
        feat, truth = next(self.gen.feedNumpyData())  # this is [ [features], [truth], [None] ]
        if self.gen.lastBatch():
            self.gen.setBuffer(self.td)
            # self.gen.prepareNextEpoch()

        def dummy_gen():
            yield (feat, truth)

        predicted = self.model.predict_generator(dummy_gen(),
                                                 steps=1,
                                                 max_queue_size=1,
                                                 use_multiprocessing=False,
                                                 verbose=2)

        self.accumulate(self.counter, feat, predicted, truth)
        self.call_counter += 1

    def on_epoch_end(self, epoch, logs=None):
        self.counter = 0
        if not self.run_on_epoch_end:
            return
        self.predict_and_call(epoch)

    def on_batch_end(self, batch, logs=None):
        if self.accumulate_after_batches <= 0:
            return
        if self.counter % self.accumulate_after_batches == 0:
            self.predict_and_call(batch)
        if self.plot_after_batches > 0:
            if self.counter % self.plot_after_batches == 0:
                self.plot()
        self.counter += 1

    def plot(self):
        if self.plot_process is not None:
            self.plot_process.join()

        self.plot_process = Process(target=self._plot,
                                    args=(copy.deepcopy(self.window_analysis_dicts),
                                          copy.deepcopy(self.scalar_metrics)))
        self.plot_process.start()

    def _plot(self, window_analysis_dicts, scalar_metrics):
        with tf.device('/CPU:0'):
            if len(window_analysis_dicts) == self.n_windows_for_plots:
                print("Plotting and publishing")
                dataset_analysis_dict = build_dataset_analysis_dict()
                dataset_analysis_dict['beta_threshold'] = self.beta_threshold
                dataset_analysis_dict['distance_threshold'] = self.distance_threshold
                dataset_analysis_dict['iou_threshold'] = self.iou_threshold
                for x in window_analysis_dicts:
                    dataset_analysis_dict = append_window_dict_to_dataset_dict(dataset_analysis_dict, x)

                make_running_plots(self.outputdir,
                                   dataset_analysis_dict,
                                   scalar_metrics,
                                   self.n_average_over_samples,
                                   get_analysis_plotting_configuration('standard_hgcal_with_ticl'))

                if self.publish is not None:
                    for f in os.listdir(self.outputdir):
                        if f.endswith('.png'):
                            f_full = os.path.join(self.outputdir, f)
                            cpstring = 'cp -f '
                            if "@" in self.publish:
                                cpstring = 'scp '
                            s = (cpstring + f_full + ' ' + self.publish + f + ' > /dev/null')
                            os.system(s)

    def accumulate(self, counter, feat, predicted, truth):
        print("Accumulating")
        with tf.device('/CPU:0'):
            new_window_analysis_dicts = self.analyse_one_file(feat, predicted, truth)
            self.window_analysis_dicts += new_window_analysis_dicts

            for i, wdict in enumerate(new_window_analysis_dicts):
                efficiency = float(wdict['window_num_found_showers']) / wdict['window_num_truth_showers']
                efficiency_ticl = float(wdict['window_num_found_showers_ticl']) / wdict['window_num_truth_showers']
                fake_rate = float(wdict['window_num_fake_showers']) / wdict['window_num_pred_showers']
                fake_rate_ticl = float(wdict['window_num_fake_showers_ticl']) / wdict['window_num_ticl_showers']

                truth_shower_energy = np.array(wdict['truth_shower_energy'])
                pred_shower_energy = np.array(wdict['truth_shower_matched_energy_regressed'])
                ticl_shower_energy = np.array(wdict['truth_shower_matched_energy_regressed_ticl'])

                filter = pred_shower_energy != -1
                filter_ticl = ticl_shower_energy != -1

                var_res = pred_shower_energy[filter] / truth_shower_energy[filter]
                var_res = np.std(var_res) / np.mean(var_res)

                var_res_ticl = ticl_shower_energy[filter_ticl] / truth_shower_energy[filter_ticl]
                var_res_ticl = np.std(var_res_ticl) / np.mean(var_res_ticl)

                iteration = counter + float(i + 1) / float(len(new_window_analysis_dicts))

                self.scalar_metrics['efficiency'].append(efficiency)
                self.scalar_metrics['efficiency_ticl'].append(efficiency_ticl)
                self.scalar_metrics['fake_rate'].append(fake_rate)
                self.scalar_metrics['fake_rate_ticl'].append(fake_rate_ticl)
                self.scalar_metrics['var_response'].append(var_res)
                self.scalar_metrics['var_response_ticl'].append(var_res_ticl)
                self.scalar_metrics['iteration'].append(iteration)

            while len(self.window_analysis_dicts) > self.n_windows_for_plots:
                self.window_analysis_dicts.pop(0)

            while len(self.scalar_metrics['iteration']) > self.n_windows_for_scalar_metrics:
                # the original popped from self.n_windows_for_scalar_metrics (an int), which would fail;
                # drop the oldest entry from each metric list instead to keep them in sync
                for v in self.scalar_metrics.values():
                    v.pop(0)

    def analyse_one_file(self, _features, predictions, truth_in, soft=False):
        predictions = tf.constant(predictions)

        row_splits = _features[1][:, 0]

        features, _ = self.ragged_constructor((_features[0], row_splits))
        truth, row_splits = self.ragged_constructor((_features[2], row_splits))

        hit_assigned_truth_id = truth[:, 0:1]

        # make 100% sure the cast doesn't hit the fan
        hit_assigned_truth_id = tf.where(hit_assigned_truth_id < -0.1,
                                         hit_assigned_truth_id - 0.1,
                                         hit_assigned_truth_id + 0.1)
        hit_assigned_truth_id = tf.cast(hit_assigned_truth_id[:, 0], tf.int32)

        window_analysis_dicts = []
        for i in range(len(row_splits) - 1):
            hit_assigned_truth_id_s = hit_assigned_truth_id[row_splits[i]:row_splits[i + 1]].numpy()
            features_s = features[row_splits[i]:row_splits[i + 1]].numpy()
            truth_s = truth[row_splits[i]:row_splits[i + 1]].numpy()
            prediction_s = predictions[row_splits[i]:row_splits[i + 1]].numpy()

            window_analysis_dict = analyse_one_window_cut(hit_assigned_truth_id_s,
                                                          features_s,
                                                          truth_s,
                                                          prediction_s,
                                                          self.beta_threshold,
                                                          self.distance_threshold,
                                                          self.iou_threshold,
                                                          self.window_id,
                                                          False,
                                                          soft=soft)
            window_analysis_dicts.append(window_analysis_dict)
            # append_window_dict_to_dataset_dict(dataset_analysis_dict, window_analysis_dict)
            # num_visualized_segments += 1
            self.window_id += 1

        return window_analysis_dicts