def load(self, fList):
    from Gaugi import load
    from Gaugi import csvStr2List, expandFolders, progressbar
    fList = csvStr2List(fList)
    fList = expandFolders(fList)
    from saphyra import TunedData_v1
    self._obj = TunedData_v1()
    for inputFile in progressbar(fList, len(fList),
                                 prefix="Reading tuned data collection...",
                                 logger=self._logger):
        raw = load(inputFile)
        # get the file version
        version = raw['__version']
        # the current file version
        if version == 1:
            obj = TunedData_v1.fromRawObj(raw)
            self._obj.merge(obj)
        else:
            # abort because this file version is not supported
            self._logger.fatal('File version (%d) not supported in (%s)', version, inputFile)
    # return the merged tuned data collection
    return self._obj
def fill(self, path, tag):
    '''
    This method will fill the information dictionary and convert it into
    a pandas DataFrame.

    Arguments:
    - path: the path to the tuned files;
    - tag: the training tag used;
    '''
    paths = expandFolders(path)
    MSG_INFO(self, "Reading files for %s tag from %s", tag, path)

    # Create the dataframe
    dataframe = collections.OrderedDict({
        'train_tag': [],
        'et_bin': [],
        'eta_bin': [],
        'model_idx': [],
        'sort': [],
        'init': [],
        'file_name': [],
        'tuned_idx': [],
    })

    # Complete the dataframe with each varname in the config dict
    for varname in self.__config_dict.keys():
        dataframe[varname] = []

    MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
    MSG_INFO(self, 'Filling the table...')

    for ituned_file_name in paths:
        gfile = load(ituned_file_name)
        tuned_file = gfile['tunedData']
        for idx, ituned in enumerate(tuned_file):
            history = ituned['history']
            #model = model_from_json(json.dumps(ituned['sequence'], separators=(',', ':')), custom_objects={'RpLayer': RpLayer})
            #model.set_weights(ituned['weights'])
            # get the basics from the model
            dataframe['train_tag'].append(tag)
            #dataframe['model'].append(model)
            dataframe['model_idx'].append(ituned['imodel'])
            dataframe['sort'].append(ituned['sort'])
            dataframe['init'].append(ituned['init'])
            dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
            dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
            dataframe['file_name'].append(ituned_file_name)
            dataframe['tuned_idx'].append(idx)
            # Get the value for each key requested by the user in the constructor args.
            for key, local in self.__config_dict.items():
                dataframe[key].append(self.__get_value(history, local))

    # Append tables if needed, ignoring the index to avoid duplicated entries in the dataframe
    self.__table = self.__table.append(pd.DataFrame(dataframe), ignore_index=True) if self.__table is not None else pd.DataFrame(dataframe)
    MSG_INFO(self, 'End of the fill step, a pandas DataFrame was created...')
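# Minimal sketch (toy values, not real tuning results): the table-building pattern used
# by fill() above, an OrderedDict of per-column lists appended row by row and then
# converted into a pandas DataFrame. The column 'metric' stands in for whatever keys the
# user passes through the config dict; all values here are hypothetical.
import collections
import pandas as pd

dataframe = collections.OrderedDict({'train_tag': [], 'et_bin': [], 'eta_bin': [], 'metric': []})
for row in [('v1', 0, 0, 0.97), ('v1', 0, 1, 0.95)]:
    dataframe['train_tag'].append(row[0])
    dataframe['et_bin'].append(row[1])
    dataframe['eta_bin'].append(row[2])
    dataframe['metric'].append(row[3])
table = pd.DataFrame(dataframe)
print(table)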
def __init__(self, fList):
    Logger.__init__(self)
    from Gaugi import csvStr2List
    from Gaugi import expandFolders
    self.fList = csvStr2List(fList)
    # expand the list produced by csvStr2List into the full set of file paths
    self.fList = expandFolders(self.fList)
    self.process_pipe = []
    self.output_stack = []
    import random
    import time
    random.seed(time.time())
    self._base_id = random.randrange(100000)
def fill(self, path, tag):
    paths = expandFolders(path)
    MSG_INFO(self, "Reading files for %s tag from %s", tag, path)

    # Create the dataframe
    dataframe = collections.OrderedDict({
        'train_tag': [],
        'et_bin': [],
        'eta_bin': [],
        'model_idx': [],
        'sort': [],
        'init': [],
        'file_name': [],
        'tuned_idx': [],
    })

    # Complete the dataframe with each varname in the config dict
    for varname in self.__config_dict.keys():
        dataframe[varname] = []

    MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
    MSG_INFO(self, 'Filling the table...')

    for ituned_file_name in paths:
        gfile = load(ituned_file_name)
        tuned_file = gfile['tunedData']
        for idx, ituned in enumerate(tuned_file):
            history = ituned['history']
            # get the basics from the model
            dataframe['train_tag'].append(tag)
            dataframe['model_idx'].append(ituned['imodel'])
            dataframe['sort'].append(ituned['sort'])
            dataframe['init'].append(ituned['init'])
            dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
            dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
            dataframe['file_name'].append(ituned_file_name)
            dataframe['tuned_idx'].append(idx)
            # Get the value for each key requested by the user in the constructor args.
            for key, local in self.__config_dict.items():
                dataframe[key].append(self.__get_value(history, local))

    self.__table = self.__table.append(pd.DataFrame(dataframe)) if self.__table is not None else pd.DataFrame(dataframe)
    MSG_INFO(self, 'End of the fill step, a pandas DataFrame was created...')
def registry(self, datasetname, path):
    # check task policy
    if datasetname.split('.')[0] != 'user':
        return (StatusCode.FATAL, 'The dataset name must start with: user.%USER.taskname.')
    username = datasetname.split('.')[1]
    if username not in [user.getUserName() for user in self.__db.getAllUsers()]:
        return (StatusCode.FATAL, 'The username does not exist in the database. Please report this to the db manager...')
    if self.__db.getDataset(username, datasetname):
        return (StatusCode.FATAL, "The dataset already exists in the database.")

    # Let's register and upload into the database
    try:
        # Create the new dataset
        ds = Dataset(id=self.__db.generateId(Dataset), username=username, dataset=datasetname)
        # If the path doesn't exist, abort
        if not os.path.exists(path):
            return (StatusCode.FATAL, "The path (%s) does not exist." % path)
        # Loop over files
        desired_id = self.__db.generateId(File) + 1
        for idx, subpath in enumerate(expandFolders(path)):
            MSG_INFO(self, "Registering %s into %s", subpath, datasetname)
            file = File(path=subpath, id=desired_id + idx)
            ds.addFile(file)
        self.__db.session().add(ds)
        self.__db.commit()
    except Exception as e:
        MSG_ERROR(self, e)
        return (StatusCode.FATAL, "Impossible to register the dataset (%s)." % datasetname)
    return (StatusCode.SUCCESS, "Successfully uploaded.")
def __init__(self, fList, reader, nFilesPerJob, nthreads):
    Logger.__init__(self)
    from Gaugi import csvStr2List
    from Gaugi import expandFolders
    fList = csvStr2List(fList)
    self._fList = expandFolders(fList)

    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    self._fList = [l for l in chunks(self._fList, nFilesPerJob)]
    self.process_pipe = []
    self._outputs = []
    self._nthreads = nthreads
    self._reader = reader
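# Minimal sketch (hypothetical file names, not part of the original sources): how the
# chunks helper above splits the expanded file list into per-job batches of nFilesPerJob.
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

files = ['f0.npz', 'f1.npz', 'f2.npz', 'f3.npz', 'f4.npz']
print(list(chunks(files, 2)))  # [['f0.npz', 'f1.npz'], ['f2.npz', 'f3.npz'], ['f4.npz']]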
def load(self, basepath, model_idx):
    '''
    This method will open all the histories that were grouped in the initialize
    method and put them into a dictionary, to make them easier to manipulate.
    Usually used for histories dumped from the best inits table.
    '''
    paths = expandFolders(basepath)
    MSG_INFO(self, "Reading %d files...", len(paths))
    h_dict = dict()
    for path in paths:
        with open(path) as f:
            obj = dict(eval(json.load(f)))
            key = 'et%d_eta%d_sort_%d' % (obj['loc']['et_bin'], obj['loc']['eta_bin'], obj['loc']['sort'])
            if obj['loc']['model_idx'] != model_idx:
                continue
            h_dict[key] = obj
    return h_dict
import re
from Gaugi import expandFolders

paths = expandFolders('JF17/')
pat = re.compile(r'.+(?P<binID>et(?P<etBinIdx>\d+).eta(?P<etaBinIdx>\d+))\..+$')
jobIDs = sorted(list(set([pat.match(f).group('binID') for f in paths if pat.match(f) is not None])))
print(jobIDs)
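# Minimal sketch (hypothetical file name): what the pattern above extracts from a file
# whose name carries the et/eta bin tag.
import re

pat = re.compile(r'.+(?P<binID>et(?P<etBinIdx>\d+).eta(?P<etaBinIdx>\d+))\..+$')
m = pat.match('JF17/sample.mc16_13TeV.et2_eta0.npz')
print(m.group('binID'), m.group('etBinIdx'), m.group('etaBinIdx'))  # et2_eta0 2 0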
def repro(self, volume, new_taskname, dataFile, old_taskname, secondaryDS, execCommand, queue='gpu', dry_run=False):
    # check task policy (user.username)
    if new_taskname.split('.')[0] != 'user':
        return (StatusCode.FATAL, 'The task name must start with user.$USER.taskname.')

    # check task policy (username must exist in the database)
    username = new_taskname.split('.')[1]
    if username not in [user.getUserName() for user in self.__db.getAllUsers()]:
        return (StatusCode.FATAL, 'The username does not exist in the database.')

    if self.__db.getUser(username).getTask(new_taskname) is not None:
        return (StatusCode.FATAL, "The task already exists in the database. Abort.")

    #
    # Check if all datasets are registered in the database
    #
    if self.__db.getDataset(username, dataFile) is None:
        return (StatusCode.FATAL, "The file (%s) does not exist in the database. It should be registered first." % dataFile)

    if self.__db.getUser(username).getTask(old_taskname) is None:
        return (StatusCode.FATAL, "The task (%s) does not exist in the database." % old_taskname)

    secondaryDS = eval(secondaryDS)
    for key in secondaryDS.keys():
        if self.__db.getDataset(username, secondaryDS[key]) is None:
            return (StatusCode.FATAL, "The secondary data file (%s) does not exist in the database. It should be registered first." % secondaryDS[key])

    #
    # check exec command policy
    #
    if '%DATA' not in execCommand:
        return (StatusCode.FATAL, "The exec command must include '%DATA' in the string. This will be substituted by the dataFile at start.")
    if '%IN' not in execCommand:
        return (StatusCode.FATAL, "The exec command must include '%IN' in the string. This will be substituted by the configFile at start.")
    if '%OUT' not in execCommand:
        return (StatusCode.FATAL, "The exec command must include '%OUT' in the string. This will be substituted by the outputFile at start.")
    for key in secondaryDS.keys():
        if key not in execCommand:
            return (StatusCode.FATAL, "The exec command must include %s in the string. This will be substituted by %s at start." % (key, secondaryDS[key]))

    #
    # Create the output dir
    #
    outputFile = volume + '/' + new_taskname
    if os.path.exists(outputFile):
        MSG_WARNING(self, "The task dir already exists in the storage. Beware!")
    else:
        # create the task dir
        MSG_INFO(self, "Creating the task dir in %s", outputFile)
        os.system('mkdir -p %s ' % (outputFile))

    #
    # create the task in the database
    #
    if not dry_run:
        try:
            user = self.__db.getUser(username)
            task = self.__db.createTask(user, new_taskname, old_taskname, dataFile, outputFile, "",
                                        secondaryDataPath=secondaryDS,
                                        templateExecArgs=execCommand,
                                        queueName=queue)
            task.setSignal(Signal.WAITING)
            task.setStatus(Status.HOLD)

            tunedFiles = expandFolders(self.__db.getUser(username).getTask(old_taskname).getTheOutputStoragePath())
            _dataFile = self.__db.getDataset(username, dataFile).getAllFiles()[0].getPath()
            _secondaryDS = {}
            for key in secondaryDS.keys():
                _secondaryDS[key] = self.__db.getDataset(username, secondaryDS[key]).getAllFiles()[0].getPath()

            for idx, _tunedFile in enumerate(tunedFiles):
                _outputFile = outputFile + '/job_configId_%d' % idx
                command = execCommand
                command = command.replace('%DATA', _dataFile)
                command = command.replace('%IN', _tunedFile)
                command = command.replace('%OUT', _outputFile)
                for key in _secondaryDS:
                    command = command.replace(key, _secondaryDS[key])
                job = self.__db.createJob(task, _tunedFile, idx, execArgs=command, priority=-1)

            task.setStatus('registered')
            self.__db.commit()
        except Exception as e:
            MSG_ERROR(self, e)
            return (StatusCode.FATAL, "Unknown error.")

    return (StatusCode.SUCCESS, "Successfully created.")
try:
    os.mkdir(dirpath)
except:
    mainLogger.warning("The output directory %s already exists in the local path", args.outputDir)

if not args.legends:
    args.legends = ['(Ref)', '(Test)']

### Get all files if needed!
files_ref = []
files_test = []
if args.reference:
    for paths in args.reference:
        files_ref.extend(expandFolders(paths))
for paths in args.test:
    files_test.extend(expandFolders(paths))

if args.debug:
    if len(files_ref) > 10:
        files_ref = files_ref[0:10]
    if len(files_test) > 10:
        files_test = files_test[0:10]

from pprint import pprint

### Get all triggers for each group
triggerList = []
for group in triggerList_group:
    if type(group) is tuple:
        for t in group:
def __call__(self, sgnFileList, bkgFileList, ofile, dump_csv=False):
    # get all keys
    paths = expandFolders(sgnFileList)
    jobIDs = sorted(list(set([self._pat.match(f).group('binID') for f in paths if self._pat.match(f) is not None])))
    npatterns = {}
    etBins = None
    etaBins = None
    debug = False

    for id in jobIDs:
        sgnSubFileList = []
        for f in expandFolders(sgnFileList):
            if id in f:
                sgnSubFileList.append(f)
        if debug:
            sgnSubFileList = sgnSubFileList[0:11]

        reader = ReaderPool(sgnSubFileList, DataReader(self._skip_these_keys), self._nFilesPerJob, self._nthreads)
        MSG_INFO(self, "Reading signal files...")
        outputs = reader()
        sgnDict = outputs.pop()
        if len(outputs) > 0:
            for from_dict in progressbar(outputs, len(outputs), 'Merging signal files: ', 60, logger=self._logger):
                DataReader.merge(from_dict, sgnDict, self._skip_these_keys)

        bkgSubFileList = []
        for f in expandFolders(bkgFileList):
            if id in f:
                bkgSubFileList.append(f)
        if debug:
            bkgSubFileList = bkgSubFileList[0:11]

        reader = ReaderPool(bkgSubFileList, DataReader(self._skip_these_keys), self._nFilesPerJob, self._nthreads)
        MSG_INFO(self, "Reading background files...")
        outputs = reader()
        bkgDict = outputs.pop()
        if len(outputs) > 0:
            for from_dict in progressbar(outputs, len(outputs), 'Merging background files: ', 60, logger=self._logger):
                DataReader.merge(from_dict, bkgDict, self._skip_these_keys)

        # Loop over regions
        d = {
            "features": sgnDict["features"],
            "etBins": sgnDict["etBins"],
            "etaBins": sgnDict["etaBins"],
            "etBinIdx": sgnDict["etBinIdx"],
            "etaBinIdx": sgnDict["etaBinIdx"],
        }

        #if not etBins: etBins = sgnDict["etBins"]
        etBins = sgnDict["etBins"]
        #if not etaBins: etaBins = sgnDict["etaBins"]
        etaBins = sgnDict["etaBins"]

        d['data'] = np.concatenate((sgnDict['pattern_' + id], bkgDict['pattern_' + id])).astype('float32')
        d['target'] = np.concatenate((np.ones((sgnDict['pattern_' + id].shape[0],)),
                                      np.zeros((bkgDict['pattern_' + id].shape[0],)))).astype('int16')

        if sgnDict['pattern_' + id] is not None:
            MSG_INFO(self, 'sgnData_%s : (%d, %d)', id, sgnDict['pattern_' + id].shape[0], sgnDict['pattern_' + id].shape[1])
        else:
            MSG_INFO(self, 'sgnData_%s : empty', id)

        if bkgDict['pattern_' + id] is not None:
            MSG_INFO(self, 'bkgData_%s : (%d, %d)', id, bkgDict['pattern_' + id].shape[0], bkgDict['pattern_' + id].shape[1])
        else:
            MSG_INFO(self, 'bkgData_%s : empty', id)

        MSG_INFO(self, "Saving: %s", ofile + '_' + id)
        npatterns['sgnPattern_' + id] = int(sgnDict['pattern_' + id].shape[0])
        npatterns['bkgPattern_' + id] = int(bkgDict['pattern_' + id].shape[0])
        save(d, ofile + '_' + id, protocol='savez_compressed')

        if dump_csv:
            # Save as csv for pandas
            dd = {}
            for ikey, key in enumerate(d['features']):
                dd[key] = d['data'][:, ikey]
            dd['target'] = d['target']
            df = pd.DataFrame(dd)
            df.to_csv(ofile + '_' + id + '.csv')

    self.plotNSamples(npatterns, etBins, etaBins)
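# Minimal sketch (synthetic shapes, not the original data): how the signal and background
# pattern blocks above are stacked into one 'data' array with a matching 'target' vector
# (ones for signal rows, zeros for background rows).
import numpy as np

sgn = np.random.rand(5, 100)   # hypothetical signal patterns: 5 samples, 100 features
bkg = np.random.rand(3, 100)   # hypothetical background patterns: 3 samples, 100 features

data = np.concatenate((sgn, bkg)).astype('float32')
target = np.concatenate((np.ones((sgn.shape[0],)), np.zeros((bkg.shape[0],)))).astype('int16')
print(data.shape, target.shape)  # (8, 100) (8,)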
args = parser.parse_args()

######################################################################################################
# definitions
level_names = ['L1Calo', 'L2Calo', 'L2', 'EFCalo', 'HLT']
plot_names = ['et', 'eta', 'mu']
xlabel_names = ['Offline isolated electron E_{T} [GeV]', '#eta', '<#mu>']
triggerList = eval(args.triggers)

### Get all files if needed!
files = []
is_emulated_trigger = []
legends = []
for idx, basepath in enumerate(args.dirs):
    mainLogger.info(basepath)
    f = expandFolders(basepath)
    if len(f) > 10 and args.debug:
        f = f[0:10]
    files.append(f)
    is_emulated_trigger.append(False)
    legends.append(args.legends[idx] if args.legends else str())

if args.emulation_list:
    for idx in args.emulation_list:
        is_emulated_trigger[idx] = True

localpath = os.getcwd()
dirpath = args.outputDir
def GetHistogramFromMany(basepath, paths, keys, prefix='Loading...', logger=None):
    from Gaugi import progressbar, expandFolders
    from copy import deepcopy

    # internal open function
    def Open(path):
        from ROOT import TFile
        f = TFile(path, 'read')
        if len(f.GetListOfKeys()) > 0:
            run_numbers = [key.GetName() for key in f.GetListOfKeys()]
            return f, run_numbers
        else:
            return f, None

    # internal close function
    def Close(f):
        f.Close()
        del f

    # internal retrieve histogram
    def GetHistogram(f, run_number, path, logger=None):
        try:
            hist = f.Get(run_number + '/' + path)
            hist.GetEntries()
            return hist
        except:
            return None

    # internal integration
    def SumHists(histList):
        totalHist = None
        for hist in histList:
            if hist is None:
                continue
            if totalHist is None:
                totalHist = deepcopy(hist.Clone())
            else:
                totalHist.Add(hist)
        return totalHist

    files = expandFolders(basepath)
    hists = {}
    for f in progressbar(files, len(files), prefix=prefix, logger=logger):
        try:
            _f, _run_numbers = Open(f)
        except:
            continue
        if _run_numbers is None:
            continue
        for idx, _path in enumerate(paths):
            for _run_number in _run_numbers:
                hist = GetHistogram(_f, _run_number, _path)
                if hist is not None:
                    if keys[idx] not in hists:
                        hists[keys[idx]] = [deepcopy(hist.Clone())]
                    else:
                        hists[keys[idx]].append(deepcopy(hist.Clone()))
        Close(_f)

    for key in hists.keys():
        hists[key] = SumHists(hists[key])
    #from pprint import pprint
    #pprint(hists)
    return hists
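# Minimal usage sketch (hypothetical folder and histogram paths, requires ROOT files on
# disk): collect the same histograms from every run-number directory in every file found
# under 'monitoring/' and integrate them per key.
hists = GetHistogramFromMany(
    basepath='monitoring/',                                    # hypothetical folder of ROOT files
    paths=['HLT/Egamma/probes/et', 'HLT/Egamma/probes/eta'],   # hypothetical histogram paths inside each run dir
    keys=['et', 'eta'],                                        # keys of the returned dictionary
    prefix='Reading monitoring files...')
# hists['et'] and hists['eta'] hold the summed histograms across all files and runs.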