def get_best_models(self, best_sorts, remove_last=True, with_history=False):
    '''
    This method will load the best models.

    Arguments:
    - best_sorts: the table that contains the best sorts;
    - remove_last: a boolean flag to remove (or not) the tanh in the output layer;
    - with_history: unused argument.
    '''
    from tensorflow.keras.models import Model, model_from_json
    from Gaugi import load
    import json

    models = [[None for _ in range(len(self.__etabins) - 1)]
              for __ in range(len(self.__etbins) - 1)]

    for et_bin in range(len(self.__etbins) - 1):
        for eta_bin in range(len(self.__etabins) - 1):
            d_tuned = {}
            best = best_sorts.loc[(best_sorts.et_bin == et_bin) & (best_sorts.eta_bin == eta_bin)]
            tuned = load(best.file_name.values[0])['tunedData'][best.model_idx.values[0]]
            # Rebuild the keras model from the stored architecture and weights
            model = model_from_json(json.dumps(tuned['sequence'], separators=(',', ':')))
            # custom_objects={'RpLayer': RpLayer}
            model.set_weights(tuned['weights'])
            # Optionally strip the last (tanh) layer and expose the previous layer's output
            new_model = Model(model.inputs, model.layers[-2].output) if remove_last else model
            d_tuned['model']     = new_model
            d_tuned['etBin']     = [self.__etbins[et_bin], self.__etbins[et_bin + 1]]
            d_tuned['etaBin']    = [self.__etabins[eta_bin], self.__etabins[eta_bin + 1]]
            d_tuned['etBinIdx']  = et_bin
            d_tuned['etaBinIdx'] = eta_bin
            d_tuned['history']   = tuned['history']
            models[et_bin][eta_bin] = d_tuned

    return models
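# Usage sketch (illustration only): `ct` stands for an instance of the
# enclosing class and `best_sorts` for a DataFrame of best sorts selected
# elsewhere; both names are assumptions, not part of the original code.
#
#   models = ct.get_best_models(best_sorts, remove_last=True)
#   for row in models:
#       for d_tuned in row:
#           print(d_tuned['etBinIdx'], d_tuned['etaBinIdx'], d_tuned['model'])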
def generator_for_signal(path):
    from Gaugi import load
    raw = load(path)
    features = raw['features'].tolist()
    data = raw['data']
    target = raw['target']
    # Keep only signal events (target == 1)
    return data[target == 1, :], features
def dump_all_history(self, table, output_path, tag):
    '''
    This method will dump the train history. This makes it easier to
    retrieve this information when plotting the training evolution.

    Arguments:
    - table: a table with the path information;
    - output_path: the path where the histories will be saved;
    - tag: the train tag.
    '''
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for _, row in table.iterrows():
        if row.train_tag != tag:
            continue
        # Load history
        history = load(row.file_name)['tunedData'][row.tuned_idx]['history']
        history['loc'] = {'et_bin': row.et_bin, 'eta_bin': row.eta_bin,
                          'sort': row.sort, 'model_idx': row.model_idx}
        name = 'history_et_%i_eta_%i_model_%i_sort_%i.json' % (row.et_bin, row.eta_bin, row.model_idx, row.sort)
        jbl_name = 'history_et_%i_eta_%i_model_%i_sort_%i.joblib' % (row.et_bin, row.eta_bin, row.model_idx, row.sort)
        # The summary block is dumped with joblib; the rest goes to json
        joblib.dump(history['summary'], os.path.join(output_path, jbl_name))
        history.pop('summary')
        with open(os.path.join(output_path, name), 'w', encoding='utf-8') as fp:
            # json.dump(transform_serialize(history), fp)
            # note: str(history) stores the python repr as a single JSON string
            json.dump(str(history), fp)
def __call__(self, generator, tunedFile, outputfile, crossval, decorators):

    context = Context()

    MSG_INFO(self, "Opening file %s...", tunedFile)
    raw = load(tunedFile)

    tunedData = TunedData_v1()

    for idx, tuned in enumerate(raw['tunedData']):

        # force the context to be empty for each iteration
        context.clear()

        sort = tuned['sort']
        init = tuned['init']
        imodel = tuned['imodel']
        history = tuned['history']

        # get the current kfold and the train/val sets
        x_train, x_val, y_train, y_val, index_from_cv = self.pattern_g(generator, crossval, sort)

        # recover the keras model
        model = model_from_json(json.dumps(tuned['sequence'], separators=(',', ':')))
        # custom_objects={'RpLayer': RpLayer}
        model.set_weights(tuned['weights'])

        # These handlers should not be stored
        context.setHandler("valData" , (x_val, y_val))
        context.setHandler("trnData" , (x_train, y_train))
        context.setHandler("index"   , index_from_cv)
        context.setHandler("crossval", crossval)

        # These will be stored in the file
        context.setHandler("model"  , model)
        context.setHandler("sort"   , sort)
        context.setHandler("init"   , init)
        context.setHandler("imodel" , imodel)
        context.setHandler("time"   , tuned['time'])
        context.setHandler("history", history)

        for tool in decorators:
            # MSG_INFO(self, "Executing the post processor %s", tool.name())
            tool.decorate(history, context)

        tunedData.attach_ctx(context)

    try:
        MSG_INFO(self, "Saving file...")
        tunedData.save(outputfile + '/' + tunedFile.split('/')[-1])
    except Exception as e:
        MSG_FATAL(self, "It's not possible to save the tuned data: %s", e)

    return StatusCode.SUCCESS
def generator(path):

    def norm1(data):
        # Normalize each event (row) by the absolute value of its ring-energy sum
        norms = np.abs(data.sum(axis=1))
        norms[norms == 0] = 1
        return data / norms[:, None]

    from Gaugi import load
    import numpy as np

    d = load(path)
    feature_names = d['features'].tolist()
    data = norm1(d['data'][:, 1:101])
    target = d['target']
    avgmu = d['data'][:, 0]

    references = ['T0HLTElectronT2CaloTight', 'T0HLTElectronT2CaloMedium',
                  'T0HLTElectronT2CaloLoose', 'T0HLTElectronT2CaloVLoose']
    # Compute detection probability (pd) and false alarm (fa) for each reference
    # trigger. Note: ref_dict is built for inspection only; it is not returned.
    ref_dict = {}
    for ref in references:
        answers = d['data'][:, feature_names.index(ref)]
        signal_passed = sum(answers[target == 1])
        signal_total = len(answers[target == 1])
        background_passed = sum(answers[target == 0])
        background_total = len(answers[target == 0])
        pd = signal_passed / signal_total
        fa = background_passed / background_total
        ref_dict[ref] = {'signal_passed': signal_passed, 'signal_total': signal_total, 'pd': pd,
                         'background_passed': background_passed, 'background_total': background_total, 'fa': fa}

    return data, target, avgmu
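# A minimal demo (illustration only, not part of the original module) of the
# norm1 normalization used above: each row of rings is divided by the absolute
# value of its energy sum, so the rings become fractions of the total energy.
# The function name `_norm1_demo` is hypothetical.
def _norm1_demo():
    import numpy as np
    rings = np.array([[2.0, 6.0, 2.0],
                      [0.0, 0.0, 0.0]])   # a zero-sum row must be left untouched
    norms = np.abs(rings.sum(axis=1))     # -> [10., 0.]
    norms[norms == 0] = 1                 # avoid division by zero
    return rings / norms[:, None]         # -> [[0.2, 0.6, 0.2], [0., 0., 0.]]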
def load(self, fList):

    from Gaugi import load
    from Gaugi import csvStr2List, expandFolders, progressbar
    fList = csvStr2List(fList)
    fList = expandFolders(fList)

    from saphyra import TunedData_v1
    self._obj = TunedData_v1()

    for inputFile in progressbar(fList, len(fList), prefix="Reading tuned data collection...", logger=self._logger):
        raw = load(inputFile)
        # get the file version
        version = raw['__version']
        # the current file version
        if version == 1:
            obj = TunedData_v1.fromRawObj(raw)
            self._obj.merge(obj)
        else:
            # unsupported file version
            self._logger.fatal('File version (%d) not supported in (%s)', version, inputFile)

    # return the merged tuned data collection
    return self._obj
def getPatterns(path, cv, sort):

    def norm1(data):
        norms = np.abs(data.sum(axis=1))
        norms[norms == 0] = 1
        return data / norms[:, None]

    from Gaugi import load
    import numpy as np

    d = load(path)

    # ------------------------------------------------------- #
    # remove zero rings
    # m_rings = list(range(8, 80)) + list(range(88, 100))
    # data = norm1(d['data'][:, m_rings])
    data = norm1(d['data'][:, 1:101])
    # ------------------------------------------------------- #

    target = d['target']
    # use -1/+1 labels (e.g. for a tanh output)
    target[target != 1] = -1

    splits = [(train_index, val_index) for train_index, val_index in cv.split(data, target)]
    x_train = data[splits[sort][0]]
    y_train = target[splits[sort][0]]
    x_val = data[splits[sort][1]]
    y_val = target[splits[sort][1]]

    return x_train, x_val, y_train, y_val, splits, []  # d['features']
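# Usage sketch: `cv` can be any cross-validator exposing split(X, y), e.g. a
# scikit-learn StratifiedKFold. The file name and the n_splits/random_state
# values below are placeholder assumptions, not taken from the original code.
#
#   from sklearn.model_selection import StratifiedKFold
#   cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=512)
#   x_train, x_val, y_train, y_val, splits, _ = getPatterns('data.npz', cv, sort=0)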
def fill(self, path, tag):
    '''
    This method will fill the information dictionary and convert it into
    a pandas DataFrame.

    Arguments:
    - path: the path to the tuned files;
    - tag: the training tag used.
    '''
    paths = expandFolders(path)
    MSG_INFO(self, "Reading file for %s tag from %s", tag, path)

    # Creating the dataframe
    dataframe = collections.OrderedDict({
        'train_tag': [],
        'et_bin': [],
        'eta_bin': [],
        'model_idx': [],
        'sort': [],
        'init': [],
        'file_name': [],
        'tuned_idx': [],
    })

    # Complete the dataframe for each varname in the config dict
    for varname in self.__config_dict.keys():
        dataframe[varname] = []

    MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
    MSG_INFO(self, 'Filling the table...')

    for ituned_file_name in paths:
        gfile = load(ituned_file_name)
        tuned_file = gfile['tunedData']
        for idx, ituned in enumerate(tuned_file):
            history = ituned['history']
            # model = model_from_json(json.dumps(ituned['sequence'], separators=(',', ':')), custom_objects={'RpLayer': RpLayer})
            # model.set_weights(ituned['weights'])

            # get the basics from the model
            dataframe['train_tag'].append(tag)
            # dataframe['model'].append(model)
            dataframe['model_idx'].append(ituned['imodel'])
            dataframe['sort'].append(ituned['sort'])
            dataframe['init'].append(ituned['init'])
            dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
            dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
            dataframe['file_name'].append(ituned_file_name)
            dataframe['tuned_idx'].append(idx)

            # Get the value for each key requested by the user in the constructor args.
            for key, local in self.__config_dict.items():
                dataframe[key].append(self.__get_value(history, local))

    # append tables if needed, ignoring the index to avoid duplicated entries
    self.__table = self.__table.append(pd.DataFrame(dataframe), ignore_index=True) if self.__table is not None else pd.DataFrame(dataframe)
    MSG_INFO(self, 'End of fill step, a pandas DataFrame was created...')
def get_history(self, path, index):
    tuned_list = load(path)['tunedData']
    for tuned in tuned_list:
        if tuned['imodel'] == index:
            return tuned['history']
    MSG_FATAL(self, "It's not possible to find the history for model index %d", index)
def __call__(self, inputFiles):
    obj = None
    for idx, f in progressbar(enumerate(inputFiles), len(inputFiles), 'Reading...: ', 60, logger=self._logger):
        # d = dict(np.load(f, allow_pickle=True))
        d = dict(load(f))
        # merge the current file into the accumulated object
        obj = self.merge(d, obj, self._skip_these_keys) if obj else d
    return obj
def dump_all_history(self, table, output_path, tag):
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for _, row in table.iterrows():
        if row.train_tag != tag:
            continue
        # Load history
        history = load(row.file_name)['tunedData'][row.tuned_idx]['history']
        history['loc'] = {'et_bin': row.et_bin, 'eta_bin': row.eta_bin,
                          'sort': row.sort, 'model_idx': row.model_idx}
        name = 'history_et_%i_eta_%i_model_%i_sort_%i.json' % (row.et_bin, row.eta_bin, row.model_idx, row.sort)
        with open(os.path.join(output_path, name), 'w') as fp:
            # json.dump(transform_serialize(history), fp)
            json.dump(str(history), fp)
def load(self, ofile):
    from Gaugi import load
    raw = load(ofile)
    # get the file version
    version = raw['__version']
    # the current file version
    if version == 1:
        # Job_v1 import path assumed to mirror TunedData_v1 elsewhere in this repo
        from saphyra import Job_v1
        self._obj = Job_v1.fromRawObj(raw)
    else:
        # unsupported file version
        self._logger.fatal('File version (%d) not supported in (%s)', version, ofile)
    # return the job object
    return self._obj
def getPatterns(path, cv, sort):

    from Gaugi import load
    import numpy as np

    def norm1(data):
        norms = np.abs(data.sum(axis=1))
        norms[norms == 0] = 1
        # return np.expand_dims(data / norms[:, None], axis=2)
        return data / norms[:, None]

    # Load data
    d = load(path)
    feature_names = d['features'].tolist()

    # Get the normalized rings
    data_rings = norm1(d['data'][:, 1:101])

    # How many events?
    n = data_rings.shape[0]

    # extract all shower shapes
    data_reta   = d['data'][:, feature_names.index('reta')].reshape((n, 1))
    data_rphi   = d['data'][:, feature_names.index('rphi')].reshape((n, 1))
    data_eratio = d['data'][:, feature_names.index('eratio')].reshape((n, 1))
    data_weta2  = d['data'][:, feature_names.index('weta2')].reshape((n, 1))
    data_f1     = d['data'][:, feature_names.index('f1')].reshape((n, 1))

    # Get the average mu
    data_mu = d['data'][:, feature_names.index('avgmu')].reshape((n, 1))
    target = d['target']

    # This is mandatory
    splits = [(train_index, val_index) for train_index, val_index in cv.split(data_mu, target)]

    data_shower_shapes = np.concatenate((data_reta, data_rphi, data_eratio, data_weta2, data_f1), axis=1)

    # split for this sort: each input is a list [rings, shower shapes]
    x_train = [data_rings[splits[sort][0]], data_shower_shapes[splits[sort][0]]]
    x_val   = [data_rings[splits[sort][1]], data_shower_shapes[splits[sort][1]]]
    y_train = target[splits[sort][0]]
    y_val   = target[splits[sort][1]]

    return x_train, x_val, y_train, y_val, splits
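# Because x_train/x_val above are lists of two arrays, the matching network
# needs two inputs. A minimal functional-API sketch; the layer sizes and the
# helper name `_build_two_input_model` are illustrative assumptions, not taken
# from the original training code.
def _build_two_input_model():
    from tensorflow.keras import layers, Model
    rings_in  = layers.Input(shape=(100,), name='rings')
    shapes_in = layers.Input(shape=(5,),   name='shower_shapes')
    # compress the rings, then combine with the raw shower shapes
    x = layers.Concatenate()([layers.Dense(16, activation='relu')(rings_in), shapes_in])
    out = layers.Dense(1, activation='tanh')(x)
    return Model([rings_in, shapes_in], out)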
def fill(self, path, tag):
    paths = expandFolders(path)
    MSG_INFO(self, "Reading file for %s tag from %s", tag, path)

    # Creating the dataframe
    dataframe = collections.OrderedDict({
        'train_tag': [],
        'et_bin': [],
        'eta_bin': [],
        'model_idx': [],
        'sort': [],
        'init': [],
        'file_name': [],
        'tuned_idx': [],
    })

    # Complete the dataframe for each varname in the config dict
    for varname in self.__config_dict.keys():
        dataframe[varname] = []

    MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
    MSG_INFO(self, 'Filling the table...')

    for ituned_file_name in paths:
        gfile = load(ituned_file_name)
        tuned_file = gfile['tunedData']
        for idx, ituned in enumerate(tuned_file):
            history = ituned['history']
            # get the basics from the model
            dataframe['train_tag'].append(tag)
            dataframe['model_idx'].append(ituned['imodel'])
            dataframe['sort'].append(ituned['sort'])
            dataframe['init'].append(ituned['init'])
            dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
            dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
            dataframe['file_name'].append(ituned_file_name)
            dataframe['tuned_idx'].append(idx)
            # Get the value for each key requested by the user in the constructor args.
            for key, local in self.__config_dict.items():
                dataframe[key].append(self.__get_value(history, local))

    self.__table = self.__table.append(pd.DataFrame(dataframe)) if self.__table is not None else pd.DataFrame(dataframe)
    MSG_INFO(self, 'End of fill step, a pandas DataFrame was created...')
from Gaugi import load
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

data = load('e.npz')
features = data['features'].tolist()


def plot_layers(sampling, d, norm=False, evt=None):

    def plot_cells(ax, cells, vmin=None, vmax=None, label='', norm=False):
        ax.imshow(cells, interpolation='nearest',
                  aspect=cells.shape[1] / cells.shape[0],
                  norm=LogNorm(vmin=vmin, vmax=vmax) if norm else None)
        ax.set(xlabel=r'$\eta$', ylabel=r'$\phi$', title=label)
        ax.tick_params(axis='both', which='both', bottom=False, top=False,
                       left=False, right=False, labelbottom=False, labelleft=False)

    import random
    f, axarr = plt.subplots(1, 3, figsize=(18, 10))
    evt = 1  # if evt is None: ... (the function is truncated here in the source)
def fill(self, path, tag):
    '''
    This method will fill the information dictionary and convert it into
    a pandas DataFrame.

    Arguments:
    - path: the path to the tuned files;
    - tag: the training tag used.
    '''
    paths = expand_folders(path)
    MSG_INFO(self, "Reading file for %s tag from %s", tag, path)

    # Creating the dataframe
    dataframe = collections.OrderedDict({
        'train_tag': [],
        'et_bin': [],
        'eta_bin': [],
        'model_idx': [],
        'sort': [],
        'init': [],
        'file_name': [],
        'tuned_idx': [],
        'op_name': [],
    })

    MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
    MSG_INFO(self, 'Filling the table...')

    for ituned_file_name in progressbar(paths, 'Reading %s...' % tag):
        try:
            gfile = load(ituned_file_name)
        except Exception:
            # MSG_WARNING(self, "File %s could not be opened. Skipping.", ituned_file_name)
            continue
        tuned_file = gfile['tunedData']

        for idx, ituned in enumerate(tuned_file):
            history = ituned['history']

            for op, config_dict in self.__config_dict.items():
                # get the basics from the model
                dataframe['train_tag'].append(tag)
                dataframe['model_idx'].append(ituned['imodel'])
                dataframe['sort'].append(ituned['sort'])
                dataframe['init'].append(ituned['init'])
                dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
                dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
                dataframe['file_name'].append(ituned_file_name)
                dataframe['tuned_idx'].append(idx)
                dataframe['op_name'].append(op)

                # Get the value for each key requested by the user in the constructor args.
                for key, local in config_dict.items():
                    if key not in dataframe.keys():
                        dataframe[key] = [self.__get_value(history, local)]
                    else:
                        dataframe[key].append(self.__get_value(history, local))

    # append tables if needed, ignoring the index to avoid duplicated entries
    self.__table = self.__table.append(pd.DataFrame(dataframe), ignore_index=True) if self.__table is not None else pd.DataFrame(dataframe)
    MSG_INFO(self, 'End of fill step, a pandas DataFrame was created...')
def getJobConfigId(path):
    from Gaugi import load
    return dict(load(path))['id']
def generator(path):

    def norm1(data):
        norms = np.abs(data.sum(axis=1))
        norms[norms == 0] = 1
        return data / norms[:, None]

    from Gaugi import load
    import numpy as np

    d = load(path)
    feature_names = d['features'].tolist()

    # Disabled shower-shape extraction, kept for reference:
    # n = d['data'].shape[0]
    # data_reta   = d['data'][:, feature_names.index('L2Calo_reta')].reshape((n, 1)) / 1.0
    # data_eratio = d['data'][:, feature_names.index('L2Calo_eratio')].reshape((n, 1)) / 1.0
    # data_f1     = d['data'][:, feature_names.index('L2Calo_f1')].reshape((n, 1)) / 0.6
    # data_f3     = d['data'][:, feature_names.index('f3')].reshape((n, 1)) / 0.04
    # data_weta2  = d['data'][:, feature_names.index('weta2')].reshape((n, 1)) / 0.02
    # data_wstot  = d['data'][:, feature_names.index('wtots1')].reshape((n, 1)) / 1.0
    # Fixes for pathological shower-shape values:
    # data_eratio[data_eratio > 10.0] = 0
    # data_eratio[data_eratio > 1.0] = 1.0
    # data_wstot[data_wstot < -99] = 0
    # data_shower = np.concatenate((data_reta, data_eratio, data_f1, data_f3, data_weta2, data_wstot), axis=1)
    # data = np.concatenate((norm1(d['data'][:, 1:101]), data_shower), axis=1)

    data = norm1(d['data'][:, 1:101])
    target = d['target']
    avgmu = d['data'][:, 0]

    references = ['T0HLTElectronT2CaloTight', 'T0HLTElectronT2CaloMedium',
                  'T0HLTElectronT2CaloLoose', 'T0HLTElectronT2CaloVLoose']
    # As in the generator above, ref_dict is built for inspection only; it is not returned.
    ref_dict = {}
    for ref in references:
        answers = d['data'][:, feature_names.index(ref)]
        signal_passed = sum(answers[target == 1])
        signal_total = len(answers[target == 1])
        background_passed = sum(answers[target == 0])
        background_total = len(answers[target == 0])
        pd = signal_passed / signal_total
        fa = background_passed / background_total
        ref_dict[ref] = {'signal_passed': signal_passed, 'signal_total': signal_total, 'pd': pd,
                         'background_passed': background_passed, 'background_total': background_total, 'fa': fa}

    return data, target, avgmu
def getPileup(path):
    from Gaugi import load
    # the first column holds the average pileup (avgmu)
    return load(path)['data'][:, 0]