def merge_successful():
    output_dir = get_output_dir()
    from DeepJetCore.DataCollection import DataCollection
    alldc = []
    for s in successful:
        in_path = output_dir + '/conversion.' + str(s) + '.dc'
        dc = None
        try:
            dc = DataCollection(in_path)
        except Exception as e:
            print('problems adding ' + in_path + " will continue nevertheless... (error see below)")
            print(e)
            dc = None
        if dc:
            alldc.append(dc)  # reuse the collection that was already read in
    print("merging DataCollections")
    merged = alldc[0]
    merged_c = 1
    for i in range(1, len(alldc)):
        try:
            merged += alldc[i]
            merged_c += 1
        except Exception as e:
            print(e)
            print('...continue adding nevertheless')
    if merged_c != len(alldc):
        print('lost ' + str(100 * (1. - float(merged_c) / float(len(alldc)))) + '%')
    print("saving merged DataCollection")
    merged.writeToFile('%s/dataCollection.dc' % output_dir)
    print('successfully merged to %s/dataCollection.dc' % output_dir)
    return merged
def __init__(self, directory):
    filename = os.path.join(directory, 'dataCollection.dc')
    file_ = open(filename, 'rb')
    self.samples = pickle.load(file_)
    sampleentries = pickle.load(file_)
    originRoots = pickle.load(file_)
    nsamples = pickle.load(file_)
    useweights = pickle.load(file_)
    batchsize = pickle.load(file_)
    dataclass = pickle.load(file_)
    weighter = pickle.load(file_)
    self._means = pickle.load(file_)
    file_.close()

    # Get means dictionary
    self.means = {name: (self._means[0][i], self._means[1][i])
                  for i, name in enumerate(self._means.dtype.names)}

    # Get DeepJetCore DataCollection
    self.dataCollection = DataCollection()
    self.dataCollection.readFromFile(filename)

    # Reading first sample & get branch structure
    fullpath = self.dataCollection.getSamplePath(self.samples[0])
    self.dataCollection.dataclass.readIn(fullpath)
    self.branches = self.dataCollection.dataclass.branches

    print("Branches:")
    for i in range(len(self.branches)):
        print("Collection", i)
        for i_b, b in enumerate(self.branches[i]):
            print("  branch %2i/%2i %40s mean %8.5f var %8.5f" % (
                i, i_b, b, self.means[b][0], self.means[b][1]))
        print()
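# Hedged illustration (not from the source): the dictionary built above maps a
# branch name to its (mean, variance) pair, which could be used to standardise
# a raw input column. 'Jet_pt' and the numerical values are assumptions for
# illustration only.
import numpy as np

means = {'Jet_pt': (45.2, 900.0)}      # same layout as self.means above
mean, var = means['Jet_pt']
x = np.array([30., 50., 120.])
x_std = (x - mean) / np.sqrt(var)      # standardise, assuming the second entry is a variance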
def loadModel(inputDir, trainData, model, LoadModel,
              sampleDatasets=None, removedVars=None):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    # inputModel = '%s/KERAS_model.h5' % inputDir
    inputWeights = '%s/KERAS_check_best_model_weights.h5' % inputDir
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())
    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars)
        evalModel.load_weights(inputWeights)
    return evalModel
def __init__(self, input_source_files_list, training_data_collection,
             predict_dir, unbuffered=False, model_path=None, max_files=4,
             inputdir=None):
    self.input_data_files = []
    self.inputdir = None
    self.predict_dir = predict_dir
    self.unbuffered = unbuffered
    self.max_files = max_files
    print("Using HGCal predictor class")

    ## prepare input lists for different file formats
    if input_source_files_list[-6:] == ".djcdc":
        print('reading from data collection', input_source_files_list)
        predsamples = DataCollection(input_source_files_list)
        self.inputdir = predsamples.dataDir
        for s in predsamples.samples:
            self.input_data_files.append(s)
    elif input_source_files_list[-6:] == ".djctd":
        self.inputdir = os.path.abspath(os.path.dirname(input_source_files_list))
        infile = os.path.basename(input_source_files_list)
        self.input_data_files.append(infile)
    else:
        print('reading from text file', input_source_files_list)
        self.inputdir = os.path.abspath(os.path.dirname(input_source_files_list))
        with open(input_source_files_list, "r") as f:
            for s in f:
                self.input_data_files.append(s.replace('\n', '').replace(" ", ""))

    self.dc = None
    if input_source_files_list[-6:] == ".djcdc" and not training_data_collection[-6:] == ".djcdc":
        self.dc = DataCollection(input_source_files_list)
    else:
        self.dc = DataCollection(training_data_collection)

    if inputdir is not None:
        self.inputdir = inputdir

    self.model_path = model_path
    if max_files > 0:
        self.input_data_files = self.input_data_files[0:min(max_files, len(self.input_data_files))]
def sumDCandWrite(filelist, outname):
    alldc = []
    for f in filelist:
        dc = DataCollection(f)
        alldc.append(dc)
        rel = os.path.relpath(dc.dataDir, os.getcwd())
        dc.prependToSampleFiles(rel + '/')
        dc.dataDir = os.getcwd()
    merged = sum(alldc)
    print(outname)
    merged.writeToFile(outname)
def sumDCandWrite(filelist, outname):
    alldc = []
    for f in filelist:
        try:
            dc = DataCollection(f)
        except Exception:
            print('read in of ' + f + ' not working, skip')
            continue
        alldc.append(dc)
        rel = os.path.relpath(dc.dataDir, os.getcwd())
        dc.prependToSampleFiles(rel + '/')
        dc.dataDir = os.getcwd()
    merged = sum(alldc)
    print(outname)
    merged.writeToFile(outname)
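# Hedged usage sketch (hypothetical file names): the helper above could be
# called on a list of existing .djcdc collections to write a single merged
# collection into the current working directory. The input and output names
# below are assumptions for illustration only.
if __name__ == '__main__':
    sumDCandWrite(['train_part1.djcdc', 'train_part2.djcdc'],
                  'merged_dataCollection.djcdc')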
def test(self):
    passed = True
    dc = DataCollection()
    dc.dataclass = TrainData_test
    dc.sourceList = [f for f in self.files.filenames]
    dc.createDataFromRoot(TrainData_test, outputDir=self.dcoutdir.path)

    gen = dc.invokeGenerator()
    gen.setBatchSize(self.n_per_batch)

    for epoch in range(10):
        gen.prepareNextEpoch()
        print("epoch", epoch, 'batches', gen.getNBatches())
        for b in range(gen.getNBatches()):
            d, _ = next(gen.feedNumpyData())
            data, rs = d[0], d[1]
            rs = np.array(rs[:, 0], dtype='int')
            rs = rs[:rs[-1]]
            #print(data)
            #print(rs[-1])
            if not raggedtester.checkData(data, rs):
                print('epoch', epoch, 'batch', b, 'broken')
                passed = False
                break
            if rs[-1] > self.n_per_batch:
                print('maximum batch size exceeded for batch ', b, 'epoch', epoch)
        print('shuffling')
        gen.shuffleFilelist()
    return passed
def merge_successful():
    output_dir = get_output_dir()
    from DeepJetCore.DataCollection import DataCollection
    alldc = []
    for s in successful:
        in_path = output_dir + '/conversion.' + str(s) + '.dc'
        dc = None
        try:
            dc = DataCollection(in_path)
        except Exception as e:
            print('problems adding ' + in_path + " will continue nevertheless... (error see below)")
            print(e)
        if dc:
            alldc.append(dc)  # reuse the collection that was already read in
    print("merging DataCollections")
    merged = sum(alldc)
    print("saving merged DataCollection")
    merged.writeToFile('%s/dataCollection.dc' % output_dir)
    print('successfully merged to %s/dataCollection.dc' % output_dir)
    return merged
import h5py
import numpy as np
import pandas as pd
from DeepJetCore.DataCollection import DataCollection


def dcToDf(dc_file, df_out):
    dc = DataCollection()
    dc.readFromFile(dc_file)
    NENT = 1  # Can skip some events
    filelist = []
    i = 0
    storeInputs = True
    count = 0
    feature_names = dc.dataclass.branches[1]
    spectator_names = dc.dataclass.branches[0]
    labels_names = dc.getUsedTruth()
    labels_names = ['truth' + l for l in labels_names]
    for s in dc.samples:
        if count > 1000000:
            break
        spath = dc.getSamplePath(s)
        filelist.append(spath)
        h5File = h5py.File(spath, 'r')
        f = h5File
        features_val_i = [h5File['x%i' % j][()]
                          for j in range(0, h5File['x_listlength'][()][0])]
        features_val_i = features_val_i[0][::NENT, 0, :]
        #predict_test_i = model.predict(features_val)
        weights_val_i = h5File['w0'][()]
        labels_val_i = h5File['y0'][()][::NENT, :]
        spectators_val_i = h5File['z0'][()][::NENT, 0, :]
        if storeInputs:
            raw_features_val_i = h5File['z1'][()][::NENT, 0, :]
        if i == 0:
            #predict_test = predict_test_i
            weights_val = weights_val_i
            labels_val = labels_val_i
            spectators_val = spectators_val_i
            features_val = features_val_i
            if storeInputs:
                raw_features_val = raw_features_val_i
        else:
            #predict_test = np.concatenate((predict_test, predict_test_i))
            weights_val = np.concatenate((weights_val, weights_val_i))
            labels_val = np.concatenate((labels_val, labels_val_i))
            features_val = np.concatenate((features_val, features_val_i))
            spectators_val = np.concatenate((spectators_val, spectators_val_i))
            if storeInputs:
                raw_features_val = np.concatenate((raw_features_val, raw_features_val_i))
        i += 1
        count += labels_val.shape[0]

    entries = np.hstack((raw_features_val, spectators_val, labels_val,
                         weights_val.reshape((len(weights_val), 1))))
    df = pd.DataFrame(entries,
                      columns=feature_names + spectator_names + labels_names + ['weight'])
    #df = pd.DataFrame(raw_features_val + spectators_val, columns=feature_names + spectator_names)
    #print(df)
    if df_out is not None:
        df.to_pickle(df_out)
        print("Saved df to", df_out)
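# Hedged usage sketch (hypothetical paths): flatten a converted data collection
# into a pandas DataFrame pickle for quick offline inspection.
dcToDf('/path/to/dataCollection.djcdc', 'features_and_truth.pkl')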
def loadModel(inputDir, trainData, model, LoadModel,
              sampleDatasets=None, removedVars=None, adv=False):
    inputModel = '%s/KERAS_check_best_model.h5' % inputDir
    from DeepJetCore.DataCollection import DataCollection
    traind = DataCollection()
    traind.readFromFile(trainData)
    traind.dataclass.regressiontargetclasses = range(0, NBINS)
    print(traind.getNRegressionTargets())
    if LoadModel:
        evalModel = load_model(inputModel, custom_objects=global_loss_list)
        shapes = traind.getInputShapes()
    else:
        shapes = traind.getInputShapes()
        train_inputs = []
        for s in shapes:
            train_inputs.append(keras.layers.Input(shape=s))
        modelargs = {}
        if adv:
            modelargs.update({
                'nRegTargets': NBINS,
                'discTrainable': True,
                'advTrainable': True
            })
        evalModel = model(train_inputs, traind.getNClassificationTargets(),
                          traind.getNRegressionTargets(), sampleDatasets,
                          removedVars, **modelargs)
        evalModel.load_weights(inputModel)
    return evalModel
    x = Dense(nregressions, activation=None, name='dense_fracs',
              kernel_initializer=keras.initializers.RandomNormal(
                  mean=0.0, stddev=0.01))(x)
    x = Concatenate(name="concatlast", axis=-1)([x] + coords + [n_showers] + [etas_phis])
    x = Multiply()([x, mask])
    predictions = [x]
    return Model(inputs=Inputs, outputs=predictions)


train = training_base(testrun=False, resumeSilently=True, renewtokens=True)

plotdc = DataCollection(
    os.path.dirname(os.path.realpath(train.inputData)) + '/merged_test.dc')
samplefile = plotdc.getSamplePath(plotdc.samples[0])


#gets called every epoch
def decay_function(aftern_batches):
    return aftern_batches  # int(aftern_batches+5)


ppdts = [
    plot_truth_pred_plus_coords_during_training(
        samplefile=samplefile,
        output_file=train.outputDir + '/train_progress' + str(0),
        use_event=use_event,
        x_index=5,
    farr = simpleArray()
    farr.createFromNumpy(feat, rs)

    truth[:, 1] = truth[:, 16]
    tarr = simpleArray()
    tarr.createFromNumpy(truth, rs)
    #tarr.cout()

    td_out = TrainData_window()
    td_out._store([farr], [tarr], [])
    td_out.writeToFile(outdir + infile)
    print(infile, 'done')


dc = DataCollection(inputdcfile)
inputdir = dc.dataDir
if not inputdir[:-1] == os.getcwd():
    print('needs to be called in same dir as dataCollection file', inputdir, os.getcwd())

inputdatafiles = []
for s in dc.samples:
    inputdatafiles.append(s)

from multiprocessing import Pool
p = Pool()
res = p.map(replace, inputdatafiles)
import tempfile
import atexit
import os
from keras.models import load_model
from keras import backend as K
from DeepJetCore.customObjects import get_custom_objects
from DeepJetCore.training.gpuTools import DJCSetGPUs

inputdatafiles = []
inputdir = None

## prepare input lists for different file formats
if args.inputSourceFileList[-6:] == ".djcdc":
    print('reading from data collection', args.inputSourceFileList)
    predsamples = DataCollection(args.inputSourceFileList)
    inputdir = predsamples.dataDir
    for s in predsamples.samples:
        inputdatafiles.append(s)
elif args.inputSourceFileList[-6:] == ".djctd":
    inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
    infile = os.path.basename(args.inputSourceFileList)
    inputdatafiles.append(infile)
else:
    print('reading from text file', args.inputSourceFileList)
    inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
    with open(args.inputSourceFileList, "r") as f:
        for s in f:
            inputdatafiles.append(s.replace('\n', '').replace(" ", ""))
if len(args.files) < 1:
    print('you must provide at least one input file')
    exit()

if not len(args.o):
    print('you must provide an output file name')
    exit()

indir = os.path.dirname(args.files[0])
if len(indir):
    indir += "/"

class_name = args.c
if class_name in class_options:
    traind = class_options[class_name]
else:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

dc = DataCollection()
dc.setDataClass(traind)
for f in args.files:
    dc.samples.append(os.path.basename(f))

outfile = args.o
if not outfile[-6:] == ".djcdc":
    outfile += ".djcdc"

dc.writeToFile(indir + outfile)
LoadModel = False
removedVars = None
forceNClasses = False
signals = [1]
sigNames = ['Hbb']
backgrounds = [0]
backNames = ['QCD']
NClasses = len(signals) + len(backgrounds)

if True:
    evalModel = loadModel(trainDir, inputTrainDataCollection, trainingModel,
                          LoadModel, forceNClasses, NClasses, inputDataset,
                          removedVars)
    evalDir = opts.o

    from DeepJetCore.DataCollection import DataCollection
    testd = DataCollection()
    testd.readFromFile(inputTestDataCollection)

    if os.path.isdir(evalDir):
        raise Exception('output directory: %s must not exist yet' % evalDir)
    else:
        os.mkdir(evalDir)

    df, features_val = makePlots(testd, evalModel, evalDir)
    makeLossPlot(trainDir, evalDir)
    #df = evaluate(testd, inputTrainDataCollection, evalModel, evalDir)
    #make_plots(evalDir, savedir='Plots')
from collections import Counter
from argparse import ArgumentParser

parser = ArgumentParser('Dataset validation hplots script')
parser.add_argument('-d', help="Data collection file")
parser.add_argument('-p', help="PDF file path (will be ignored in validate mode)")
parser.add_argument('-n', help="Number of events to produce dataset stats pdf on", default="50")
parser.add_argument('--validate', dest='validate', action='store_true')
parser.set_defaults(validate=False)
args = parser.parse_args()

dc = DataCollection(args.d)
td = dc.dataclass()  # this is actually saved
#JK: this combination enforces one event per batch, then the extra row split loop is not needed
batchsize = 1
dc.setBatchSize(batchsize)

print("Invoking generator")
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)
# gen.setBuffer(td)

print("n batches")
n_batches = gen.getNBatches()
print(n_batches)
print("probably ready")

#gpus = tf.config.list_physical_devices('GPU')
gpus = 0
from DeepJetCore.DataCollection import DataCollection
from pprint import pprint

dc = DataCollection()
dc.readFromFile('dc/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepCSV/train/dataCollection.dc')
#dc.readFromFile('/storage/9/dseith/DeepJet/deepCSV/results/../../Ntuples/Thu_135917_batch/dataCollections/deepFlavour_FT_reg/train/dataCollection.dc')

#pprint (dc.means[0])
#print '-'*100
#pprint (dc.means[1])
#print '-'*100
#pprint (dc.means.dtype.names)
#pprint (dc.means[0][0].dtype)
#pprint (dc.useweights)
#pprint (dc.weighter)
#pprint (dc.samples)
#pprint (dc.sampleentries)
#pprint (dc.originRoots)
#pprint (dc.nsamples)
#pprint (dc.useweights)
##pprint (dc.__batchsize)
pprint(dc.dataclass)
#pprint (dc.weighter)
#pprint (dc.means)

six_times = [
    'TagVarCSVTrk_trackJetDistVal',
    'TagVarCSVTrk_trackPtRel',
    'TagVarCSVTrk_trackDeltaR',
    'TagVarCSVTrk_trackPtRatio',
    'TagVarCSVTrk_trackSip3dSig',
    'TagVarCSVTrk_trackSip2dSig',
    'TagVarCSVTrk_trackDecayLenVal'
from argparse import ArgumentParser

parser = ArgumentParser(
    'merge or split files belonging to a dataCollection differently. '
    'The output will be written to the current working directory!')
parser.add_argument("infile", help="input \"dc\" file")
parser.add_argument("nelementsperfile",
                    help="number of entries per file (output), for ragged, maximum number of elements")
parser.add_argument("--randomise",
                    help="randomise order, could be helpful if different samples need to be mixed",
                    action='store_true')
args = parser.parse_args()

from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.dataPipeline import TrainDataGenerator

infile = args.infile
nbatch = int(args.nelementsperfile)
randomise = args.randomise

dc = DataCollection(infile)
dc2 = DataCollection(infile)
samples = dc.samples
dir = dc.dataDir
if len(dir) < 1:
    dir = '.'
insamples = [dir + '/' + s for s in samples]

gen = TrainDataGenerator()
gen.setBatchSize(nbatch)
gen.setSkipTooLargeBatches(False)
gen.setFileList(insamples)
if randomise:
    gen.shuffleFileList()
args = parser.parse_args()
batchsize = int(args.b)

#if os.path.isdir(args.outputDir):
#    raise Exception('output directory must not exists yet')

custom_objs = {}
custom_objs.update(djc_global_loss_list)
custom_objs.update(djc_global_layers_list)
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)
custom_objs.update(global_metrics_list)

model = load_model(args.inputModel, custom_objects=custom_objs)
dc = DataCollection(args.trainingDataCollection)
td = dc.dataclass()

outputs = []
inputdir = os.path.abspath(os.path.dirname(args.inputSourceFileList))
os.system('mkdir -p ' + args.outputDir)

with open(args.inputSourceFileList, "r") as f:
    for inputfile in f:
        inputfile = inputfile.replace('\n', '')
        outfilename = "pred_" + inputfile
        print('converting ' + inputfile)
        tmpdir = tempfile.mkdtemp(suffix="djcpred", dir="/dev/shm")

        def removeTmp():
            os.system("rm -rf " + tmpdir)
print('creating a dummy datacollection for means/norms and weighter (can take a while)...')

from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.conversion.conversion import class_options

try:
    cls = class_options[args.c]
except KeyError:
    raise Exception('wrong class selection')

if not args.classArgs:
    args.classArgs = tuple()

dc = DataCollection(nprocs=-1)
dc.meansnormslimit = int(args.nforweighter)
try:
    dc.convertListOfRootFiles(
        args.infile,
        cls(*args.classArgs),
        args.out,
        means_only=True,
        output_name='batch_template.dc',
        relpath=('' if args.noRelativePaths else
                 os.path.dirname(os.path.realpath(args.infile))))
except Exception:
    print('The first round of root conversion failed')
    raise
if os.path.isdir(args.outputDir):
    raise Exception('output directory must not exist yet')

custom_objs = {}
custom_objs.update(global_loss_list)
custom_objs.update(global_layers_list)

model = load_model(args.inputModel, custom_objects=custom_objs)
td = testDescriptor()
if args.use:
    td.use_only = [int(i) for i in args.use.split(',')]

from DeepJetCore.DataCollection import DataCollection

testd = DataCollection()
testd.readFromFile(args.inputDataCollection)

os.mkdir(args.outputDir)

td.makePrediction(
    model, testd, args.outputDir,
    store_labels=args.labels,
    monkey_class=args.monkey_class)

td.writeToTextFile(args.outputDir + '/tree_association.txt')

# make the file reading entirely C++
# then it can be used for other studies
args = parser.parse_args()

minbatch = int(args.min)
maxbatch = int(args.max)
n_plots = int(args.n)
infile = args.inputFile
batchsize = int(args.b)

from DeepJetCore.DataCollection import DataCollection
from index_dicts import create_truth_dict, create_feature_dict
from ragged_plotting_tools import make_original_truth_shower_plot, createRandomizedColors
import matplotlib
import matplotlib.pyplot as plt
import random

dc = DataCollection(infile)
dc.setBatchSize(batchsize)
gen = dc.invokeGenerator()
nbatches = gen.getNBatches()

if maxbatch >= nbatches:
    raise ValueError("maxbatch >= nbatches in sample")
if minbatch >= maxbatch:
    raise ValueError("minbatch >= maxbatch")

events = random.sample(range(minbatch, maxbatch), n_plots)
lastev = -1
n_plots_done = 0

print('scanning...')
for i in range(nbatches):
    f, t = next(gen.feedNumpyData())
#!/usr/bin/env python3

from argparse import ArgumentParser

parser = ArgumentParser(
    'Check if all files in a dataset (datacollection) are ok or remove a specific entry\n')
parser.add_argument('inputDataCollection')
parser.add_argument('--remove', default="")
parser.add_argument('--skip_first', default=0)
args = parser.parse_args()

from DeepJetCore.DataCollection import DataCollection

dc = DataCollection(args.inputDataCollection)
dc.writeToFile(args.inputDataCollection + ".backup")

if not len(args.remove):
    dc.validate(remove=True, skip_first=int(args.skip_first))
else:
    dc.removeEntry(args.remove)

print('total size after: ' + str(dc.nsamples))
dc.writeToFile(args.inputDataCollection)
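# Hedged usage sketch: the script above can be run once to validate a
# collection in place, and again to drop a single sample entry. The script and
# file names below are assumptions for illustration; the flags are from the
# parser above.
#   check_dc.py train/dataCollection.djcdc
#   check_dc.py train/dataCollection.djcdc --remove sample_12.djctd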
    do_write = False
    for iline, line in enumerate(source):
        if iline == args.inRange[0]:
            do_write = True
        elif iline == args.inRange[1]:
            break
        if do_write:
            path = os.path.realpath(os.path.join(relpath, line))
            my_infile.write(path)
    infile = my_infile.name
    # new infile will always have absolute path
    relpath = ''

# MAIN BODY #

dc = DataCollection(nprocs=(1 if args.nothreads else -1))
dc.meansnormslimit = int(args.nforweighter)
if len(nchilds):
    dc.nprocs = int(nchilds)
if args.batch is not None:
    dc.batch_mode = True

traind = None
if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')
else:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

if ".dc" not in infile:
    raise Exception('wrong input file ' + infile)

dir = os.path.dirname(infile)

dcold = DCOld()
dcold.readRawFromFile(infile)

dcnew = DataCollection()
dcnew.dataclass = traind()
dcnew.samples = [s[:-4] + 'djctd' for s in dcold.samples]
print(dcnew.samples)
dcnew.sourceList = dcold.originRoots
# leave traindata undefined, no way to convert.
dcnew.__nsamples = 0  # determine again, also check

outfile = infile[:-2] + 'djcdc'
print("infile: ", infile, " outfile", outfile)


def worker(i):
    td = TDOld()
    tdnew = TrainData()
    print("converting", dcold.samples[i])
        'When running in batch mode you should also '
        'provide a means source through the --usemeansfrom option')

if args.v:
    logging.getLogger().setLevel(logging.DEBUG)
elif args.q:
    logging.getLogger().setLevel(logging.WARNING)

if infile:
    logging.info("infile = %s" % infile)
if outPath:
    logging.info("outPath = %s" % outPath)

# MAIN BODY #

dc = DataCollection(
    nprocs=(1 if args.nothreads else -1),
    useRelativePaths=True if not args.noRelativePaths else False)
if len(nchilds):
    dc.nprocs = int(nchilds)

if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

if testdatafor:
    logging.info('converting test data, no weights applied')
    dc.createTestDataForDataCollection(
        testdatafor,
    do_write = False
    for iline, line in enumerate(source):
        if iline == args.inRange[0]:
            do_write = True
        elif iline == args.inRange[1]:
            break
        if do_write:
            path = os.path.realpath(os.path.join(relpath, line))
            my_infile.write(path)
    infile = my_infile.name
    # new infile will always have absolute path
    relpath = ''

# MAIN BODY #

dc = DataCollection(nprocs=(1 if args.nothreads else -1))
dc.meansnormslimit = int(args.nforweighter)
dc.no_copy_on_convert = args.noramcopy
if len(nchilds):
    dc.nprocs = int(nchilds)
if args.batch is not None:
    dc.batch_mode = True

traind = None
if class_name in class_options:
    traind = class_options[class_name]
elif not recover and not testdatafor:
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')
class training_base(object):

    def __init__(self,
                 splittrainandtest=0.85,
                 useweights=False,
                 testrun=False,
                 testrun_fraction=0.1,
                 resumeSilently=False,
                 renewtokens=True,
                 collection_class=DataCollection,
                 parser=None,
                 recreate_silently=False):

        import sys
        scriptname = sys.argv[0]

        if parser is None:
            parser = ArgumentParser('Run the training')
        parser.add_argument('inputDataCollection')
        parser.add_argument('outputDir')
        parser.add_argument('--modelMethod',
                            help='Method to be used to instantiate model in derived training class',
                            metavar='OPT',
                            default=None)
        parser.add_argument("--gpu",
                            help="select specific GPU",
                            metavar="OPT",
                            default="")
        parser.add_argument("--gpufraction",
                            help="select memory fraction for GPU",
                            type=float,
                            metavar="OPT",
                            default=-1)
        parser.add_argument("--submitbatch",
                            help="submits the job to condor",
                            default=False,
                            action="store_true")
        parser.add_argument("--walltime",
                            help="sets the wall time for the batch job, format: 1d5h or 2d or 3h etc",
                            default='1d')
        parser.add_argument("--isbatchrun",
                            help="is batch run",
                            default=False,
                            action="store_true")
        parser.add_argument("--valdata",
                            help="set validation dataset (optional)",
                            default="")
        parser.add_argument("--takeweights",
                            help="Applies weights from the model given as relative or absolute path. "
                                 "Matches by names and skips layers that don't match.",
                            default="")

        args = parser.parse_args()
        self.args = args
        import sys
        self.argstring = sys.argv
        # sanity check
        if args.isbatchrun:
            args.submitbatch = False
            resumeSilently = True

        if args.submitbatch:
            print('submitting batch job. Model will be compiled for testing before submission (GPU settings being ignored)')

        import matplotlib
        # if no X11 use below
        matplotlib.use('Agg')
        DJCSetGPUs(args.gpu)

        if args.gpufraction > 0 and args.gpufraction < 1:
            import sys
            import tensorflow as tf
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpufraction)
            sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
            import keras
            from keras import backend as K
            K.set_session(sess)
            print('using gpu memory fraction: ' + str(args.gpufraction))

        import keras

        self.ngpus = 1
        self.dist_strat_scope = None
        if len(args.gpu):
            self.ngpus = len([i for i in args.gpu.split(',')])
            print('running on ' + str(self.ngpus) + ' gpus')
            if self.ngpus > 1:
                import tensorflow as tf
                self.dist_strat_scope = tf.distribute.MirroredStrategy()

        self.keras_inputs = []
        self.keras_inputsshapes = []
        self.keras_model = None
        self.keras_model_method = args.modelMethod
        self.keras_weight_model_path = args.takeweights
        self.train_data = None
        self.val_data = None
        self.startlearningrate = None
        self.optimizer = None
        self.trainedepoches = 0
        self.compiled = False
        self.checkpointcounter = 0
        self.renewtokens = renewtokens
        if args.isbatchrun:
            self.renewtokens = False
        self.callbacks = None
        self.custom_optimizer = False
        self.copied_script = ""
        self.submitbatch = args.submitbatch
        self.GAN_mode = False

        self.inputData = os.path.abspath(args.inputDataCollection) \
            if ',' not in args.inputDataCollection else \
            [os.path.abspath(i) for i in args.inputDataCollection.split(',')]
        self.outputDir = args.outputDir

        # create output dir
        isNewTraining = True
        if os.path.isdir(self.outputDir):
            if not (resumeSilently or recreate_silently):
                var = input('output dir exists. To recover a training, please type "yes"\n')
                if not var == 'yes':
                    raise Exception('output directory must not exist yet')
            isNewTraining = False
            if recreate_silently:
                isNewTraining = True
        else:
            os.mkdir(self.outputDir)
        self.outputDir = os.path.abspath(self.outputDir)
        self.outputDir += '/'

        if recreate_silently:
            os.system('rm -rf ' + self.outputDir + '*')

        # copy configuration to output dir
        if not args.isbatchrun:
            try:
                shutil.copyfile(scriptname, self.outputDir + os.path.basename(scriptname))
            except shutil.SameFileError:
                pass
            except BaseException as e:
                raise e
            self.copied_script = self.outputDir + os.path.basename(scriptname)
        else:
            self.copied_script = scriptname

        self.train_data = collection_class()
        self.train_data.readFromFile(self.inputData)
        self.train_data.useweights = useweights

        if len(args.valdata):
            print('using validation data from ', args.valdata)
            self.val_data = DataCollection(args.valdata)
        else:
            if testrun:
                if len(self.train_data) > 1:
                    self.train_data.split(testrun_fraction)
                self.train_data.dataclass_instance = None  # can't be pickled
                self.val_data = copy.deepcopy(self.train_data)
            else:
                self.val_data = self.train_data.split(splittrainandtest)

        shapes = self.train_data.getKerasFeatureShapes()
        inputdtypes = self.train_data.getKerasFeatureDTypes()
        inputnames = self.train_data.getKerasFeatureArrayNames()
        for i in range(len(inputnames)):
            if inputnames[i] == "" or inputnames[i] == "_rowsplits":
                inputnames[i] = "input_" + str(i) + inputnames[i]

        print("shapes", shapes)
        print("inputdtypes", inputdtypes)
        print("inputnames", inputnames)

        self.keras_inputs = []
        self.keras_inputsshapes = []
        counter = 0
        for s, dt, n in zip(shapes, inputdtypes, inputnames):
            self.keras_inputs.append(keras.layers.Input(shape=s, dtype=dt, name=n))
            self.keras_inputsshapes.append(s)

        if not isNewTraining:
            kfile = self.outputDir + '/KERAS_check_model_last.h5' \
                if os.path.isfile(self.outputDir + '/KERAS_check_model_last.h5') else \
                self.outputDir + '/KERAS_model.h5'
            if os.path.isfile(kfile):
                print(kfile)
                if self.dist_strat_scope is not None:
                    with self.dist_strat_scope.scope():
                        self.loadModel(kfile)
                else:
                    self.loadModel(kfile)
                self.trainedepoches = 0
                if os.path.isfile(self.outputDir + 'losses.log'):
                    for line in open(self.outputDir + 'losses.log'):
                        valloss = line.split(' ')[1][:-1]
                        if not valloss == "None":
                            self.trainedepoches += 1
                else:
                    print('incomplete epochs, starting from the beginning but with pretrained model')
            else:
                print('no model found in existing output dir, starting training from scratch')

    def __del__(self):
        if hasattr(self, 'train_data'):
            del self.train_data
            del self.val_data

    def modelSet(self):
        return (not self.keras_model == None) and not len(self.keras_weight_model_path)

    def setDJCKerasModel(self, model, *args, **kwargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        self.keras_model = model(*args, **kwargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)
        if not self.keras_model:
            raise Exception('Setting DJCKerasModel not successful')

    def setModel(self, model, **modelargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model = model(self.keras_inputs, **modelargs)
        else:
            self.keras_model = model(self.keras_inputs, **modelargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):  # compatibility
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)
        if len(self.keras_weight_model_path):
            from DeepJetCore.modeltools import apply_weights_where_possible, load_model
            self.keras_model = apply_weights_where_possible(
                self.keras_model, load_model(self.keras_weight_model_path))
        #try:
        #    self.keras_model = model(self.keras_inputs, **modelargs)
        #except BaseException as e:
        #    print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore')
        #    raise e
        if not self.keras_model:
            raise Exception('Setting model not successful')

    def saveCheckPoint(self, addstring=''):
        self.checkpointcounter = self.checkpointcounter + 1
        self.saveModel("KERAS_model_checkpoint_" + str(self.checkpointcounter) + "_" + addstring + ".h5")

    def loadModel(self, filename):
        from keras.models import load_model
        self.keras_model = load_model(filename, custom_objects=custom_objects_list)
        self.optimizer = self.keras_model.optimizer
        self.compiled = True
        if self.ngpus > 1:
            self.compiled = False

    def setCustomOptimizer(self, optimizer):
        self.optimizer = optimizer
        self.custom_optimizer = True

    def compileModel(self,
                     learningrate,
                     clipnorm=None,
                     discriminator_loss=['binary_crossentropy'],
                     print_models=False,
                     metrics=None,
                     **compileargs):
        if not self.keras_model and not self.GAN_mode:
            raise Exception('set model first')

        if self.ngpus > 1 and not self.submitbatch:
            print('Model being compiled for ' + str(self.ngpus) + ' gpus')

        self.startlearningrate = learningrate

        if not self.custom_optimizer:
            from keras.optimizers import Adam
            if clipnorm:
                self.optimizer = Adam(lr=self.startlearningrate, clipnorm=clipnorm)
            else:
                self.optimizer = Adam(lr=self.startlearningrate)

        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model.compile(optimizer=self.optimizer,
                                         metrics=metrics,
                                         **compileargs)
        else:
            self.keras_model.compile(optimizer=self.optimizer,
                                     metrics=metrics,
                                     **compileargs)
        if print_models:
            print(self.keras_model.summary())
        self.compiled = True

    def compileModelWithCustomOptimizer(self, customOptimizer, **compileargs):
        raise Exception('DEPRECATED: please use setCustomOptimizer before calling compileModel')

    def saveModel(self, outfile):
        if not self.GAN_mode:
            self.keras_model.save(self.outputDir + outfile)
        else:
            self.gan.save(self.outputDir + 'GAN_' + outfile)
            self.generator.save(self.outputDir + 'GEN_' + outfile)
            self.discriminator.save(self.outputDir + 'DIS_' + outfile)
        #import h5py
        #f = h5py.File(self.outputDir + outfile, 'r+')
        #del f['optimizer_weights']
        #f.close()

    def _initTraining(self, nepochs, batchsize, use_sum_of_squares=False):
        if self.submitbatch:
            from DeepJetCore.training.batchTools import submit_batch
            submit_batch(self, self.args.walltime)
            exit()  # don't delete this!

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)
        self.train_data.batch_uses_sum_of_squares = use_sum_of_squares
        self.val_data.batch_uses_sum_of_squares = use_sum_of_squares

        self.train_data.writeToFile(self.outputDir + 'trainsamples.djcdc')
        self.val_data.writeToFile(self.outputDir + 'valsamples.djcdc')

        # make sure tokens don't expire
        from .tokenTools import checkTokens, renew_token_process
        from _thread import start_new_thread
        if self.renewtokens:
            print('starting afs backgrounder')
            checkTokens()
            start_new_thread(renew_token_process, ())

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)

    def trainModel(self,
                   nepochs,
                   batchsize,
                   run_eagerly=False,
                   batchsize_use_sum_of_squares=False,
                   extend_truth_list_by=0,  # extend the truth list with dummies. Useful when adding more prediction outputs than truth inputs
                   stop_patience=-1,
                   lr_factor=0.5,
                   lr_patience=-1,
                   lr_epsilon=0.003,
                   lr_cooldown=6,
                   lr_minimum=0.000001,
                   checkperiod=10,
                   backup_after_batches=-1,
                   additional_plots=None,
                   additional_callbacks=None,
                   load_in_mem=False,
                   max_files=-1,
                   plot_batch_loss=False,
                   **trainargs):

        self.keras_model.run_eagerly = run_eagerly
        # write only after the output classes have been added
        self._initTraining(nepochs, batchsize, batchsize_use_sum_of_squares)

        self.keras_model.save(self.outputDir + 'KERAS_untrained_model.h5')
        print('setting up callbacks')
        from .DeepJet_callbacks import DeepJet_callbacks
        minTokenLifetime = 5
        if not self.renewtokens:
            minTokenLifetime = -1

        self.callbacks = DeepJet_callbacks(self.keras_model,
                                           stop_patience=stop_patience,
                                           lr_factor=lr_factor,
                                           lr_patience=lr_patience,
                                           lr_epsilon=lr_epsilon,
                                           lr_cooldown=lr_cooldown,
                                           lr_minimum=lr_minimum,
                                           outputDir=self.outputDir,
                                           checkperiod=checkperiod,
                                           backup_after_batches=backup_after_batches,
                                           checkperiodoffset=self.trainedepoches,
                                           additional_plots=additional_plots,
                                           batch_loss=plot_batch_loss,
                                           minTokenLifetime=minTokenLifetime)

        if additional_callbacks is not None:
            if not isinstance(additional_callbacks, list):
                additional_callbacks = [additional_callbacks]
            self.callbacks.callbacks.extend(additional_callbacks)

        print('starting training')
        if load_in_mem:
            if match_truth_and_pred_list:
                raise ValueError("match_truth_and_pred_list not available with load_in_mem")
            print('make features')
            X_train = self.train_data.getAllFeatures(nfiles=max_files)
            X_test = self.val_data.getAllFeatures(nfiles=max_files)
            print('make truth')
            Y_train = self.train_data.getAllLabels(nfiles=max_files)
            Y_test = self.val_data.getAllLabels(nfiles=max_files)
            self.keras_model.fit(X_train,
                                 Y_train,
                                 batch_size=batchsize,
                                 epochs=nepochs,
                                 callbacks=self.callbacks.callbacks,
                                 validation_data=(X_test, Y_test),
                                 max_queue_size=1,
                                 use_multiprocessing=False,
                                 workers=0,
                                 **trainargs)
        else:
            # prepare generator
            print("setting up generator... can take a while")
            traingen = self.train_data.invokeGenerator()
            valgen = self.val_data.invokeGenerator()
            # this is fixed
            traingen.extend_truth_list_by = extend_truth_list_by
            valgen.extend_truth_list_by = extend_truth_list_by

            while (self.trainedepoches < nepochs):
                # this can change from epoch to epoch
                # calculate steps for this epoch
                # feed info below
                traingen.prepareNextEpoch()
                valgen.prepareNextEpoch()
                nbatches_train = traingen.getNBatches()  # might have changed due to shuffling
                nbatches_val = valgen.getNBatches()

                print('>>>> epoch', self.trainedepoches, "/", nepochs)
                print('training batches: ', nbatches_train)
                print('validation batches: ', nbatches_val)

                self.keras_model.fit(traingen.feedNumpyData(),
                                     steps_per_epoch=nbatches_train,
                                     epochs=self.trainedepoches + 1,
                                     initial_epoch=self.trainedepoches,
                                     callbacks=self.callbacks.callbacks,
                                     validation_data=valgen.feedNumpyData(),
                                     validation_steps=nbatches_val,
                                     max_queue_size=1,
                                     use_multiprocessing=False,
                                     workers=0,
                                     **trainargs)
                self.trainedepoches += 1
                traingen.shuffleFilelist()

        # self.saveModel("KERAS_model.h5")
        return self.keras_model, self.callbacks.history

    def change_learning_rate(self, new_lr):
        import keras.backend as K
        if self.GAN_mode:
            K.set_value(self.discriminator.optimizer.lr, new_lr)
            K.set_value(self.gan.optimizer.lr, new_lr)
        else:
            K.set_value(self.keras_model.optimizer.lr, new_lr)
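# Hedged usage sketch (illustration only): a minimal training script built on
# the training_base class above. 'my_model' is an assumed user-defined function
# mapping the prepared Keras input tensors to a keras.Model; the learning rate,
# loss, epoch count and batch size are arbitrary example values, not defaults
# taken from the source.
def my_model(inputs):
    import keras
    x = keras.layers.Dense(64, activation='relu')(inputs[0])
    x = keras.layers.Dense(2, activation='softmax')(x)
    return keras.models.Model(inputs=inputs, outputs=[x])


train = training_base(testrun=False, resumeSilently=True)
if not train.modelSet():
    train.setModel(my_model)
    train.compileModel(learningrate=1e-3, loss='categorical_crossentropy')
model, history = train.trainModel(nepochs=10, batchsize=100)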
#!/usr/bin/env python
# encoding: utf-8

from argparse import ArgumentParser
from DeepJetCore.DataCollection import DataCollection

parser = ArgumentParser(
    'convert a data collection to a single set of numpy arrays. Warning, this can produce a large output')
parser.add_argument('inputDataCollection')
parser.add_argument('outputFilePrefix')
args = parser.parse_args()

print('reading data collection')
dc = DataCollection()
dc.readFromFile(args.inputDataCollection)

print('producing feature array')
feat = dc.getAllFeatures()
print('producing truth array')
truth = dc.getAllLabels()
print('producing weight array')
weight = dc.getAllWeights()
print('producing means and norms array')
means = dc.means

from numpy import save