Example #1
    def load(self, fList):
        from Gaugi import load
        from Gaugi import csvStr2List, expandFolders, progressbar
        fList = csvStr2List(fList)
        fList = expandFolders(fList)
        from saphyra import TunedData_v1
        self._obj = TunedData_v1()

        for inputFile in progressbar(fList,
                                     len(fList),
                                     prefix="Reading tuned data collection...",
                                     logger=self._logger):

            raw = load(inputFile)
            # get the file version
            version = raw['__version']
            # the current file version
            if version == 1:
                obj = TunedData_v1.fromRawObj(raw)
                self._obj.merge(obj)
            else:
                # abort because the file version is not supported
                self._logger.fatal('File version (%d) not supported in (%s)',
                                   version, inputFile)

        # return the merged tuned data collection
        return self._obj
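
The csvStr2List/expandFolders pair at the top of the method above is a recurring Gaugi idiom in these examples for turning a comma-separated string of files and/or directories into a flat file list. A minimal standalone sketch of that idiom (the input paths are illustrative only):

# Sketch of the csvStr2List + expandFolders idiom; the input paths are made up.
from Gaugi import csvStr2List, expandFolders

fList = csvStr2List('tunings/job_config.0.pic,more_tunings/')  # split the comma-separated string into a list
fList = expandFolders(fList)                                   # recursively expand any directories into files
print('%d files found' % len(fList))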
Example #2
    def fill(self, path, tag):
        '''
        This method will fill the information dictionary and convert it into a pandas DataFrame.

        Arguments:

        - path: the path to the tuned files;
        - tag: the training tag used;
        '''
        paths = expandFolders( path )
        MSG_INFO(self, "Reading file for %s tag from %s", tag , path)

        # Creating the dataframe
        dataframe = collections.OrderedDict({
                              'train_tag'      : [],
                              'et_bin'         : [],
                              'eta_bin'        : [],
                              'model_idx'      : [],
                              'sort'           : [],
                              'init'           : [],
                              'file_name'      : [],
                              'tuned_idx'      : [],
                          })


        # Complete the dataframe for each varname in the config dict
        for varname in self.__config_dict.keys():
            dataframe[varname] = []

        MSG_INFO(self, 'There are %i files for this task...' %(len(paths)))
        MSG_INFO(self, 'Filling the table... ')

        for ituned_file_name in paths:
            gfile = load(ituned_file_name)
            tuned_file = gfile['tunedData']

            for idx, ituned in enumerate(tuned_file):
                history = ituned['history']
                #model = model_from_json( json.dumps(ituned['sequence'], separators=(',', ':')) , custom_objects={'RpLayer':RpLayer} )
                #model.set_weights( ituned['weights'] )

                # get the basic from model
                dataframe['train_tag'].append(tag)
                #dataframe['model'].append(model)
                dataframe['model_idx'].append(ituned['imodel'])
                dataframe['sort'].append(ituned['sort'])
                dataframe['init'].append(ituned['init'])
                dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
                dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
                dataframe['file_name'].append(ituned_file_name)
                dataframe['tuned_idx'].append( idx )

                # Get the value for each wanted key passed by the user in the constructor args.
                for key, local  in self.__config_dict.items():
                    dataframe[key].append( self.__get_value( history, local ) )

        # Append to the existing table if there is one, ignoring the index to avoid duplicated entries
        self.__table = self.__table.append( pd.DataFrame(dataframe), ignore_index=True ) if self.__table is not None else pd.DataFrame(dataframe)
        MSG_INFO(self, 'End of fill step, a pandas DataFrame was created...')
Example #3
    def __init__(self, fList):

        Logger.__init__(self)
        from Gaugi import csvStr2List
        from Gaugi import expandFolders
        self.fList = csvStr2List(fList)
        self.fList = expandFolders(self.fList)
        self.process_pipe = []
        self.output_stack = []
        import random
        import time
        random.seed(time.time())
        self._base_id = random.randrange(100000)
Example #4
    def fill(self, path, tag):

        paths = expandFolders(path)
        MSG_INFO(self, "Reading file for %s tag from %s", tag, path)

        # Creating the dataframe
        dataframe = collections.OrderedDict({
            'train_tag': [],
            'et_bin': [],
            'eta_bin': [],
            'model_idx': [],
            'sort': [],
            'init': [],
            'file_name': [],
            'tuned_idx': [],
        })

        # Complete the dataframe for each varname in the config dict
        for varname in self.__config_dict.keys():
            dataframe[varname] = []

        MSG_INFO(self, 'There are %i files for this task...' % (len(paths)))
        MSG_INFO(self, 'Filling the table... ')

        for ituned_file_name in paths:
            gfile = load(ituned_file_name)
            tuned_file = gfile['tunedData']

            for idx, ituned in enumerate(tuned_file):
                history = ituned['history']
                # get the basic from model
                dataframe['train_tag'].append(tag)
                dataframe['model_idx'].append(ituned['imodel'])
                dataframe['sort'].append(ituned['sort'])
                dataframe['init'].append(ituned['init'])
                dataframe['et_bin'].append(self.get_etbin(ituned_file_name))
                dataframe['eta_bin'].append(self.get_etabin(ituned_file_name))
                dataframe['file_name'].append(ituned_file_name)
                dataframe['tuned_idx'].append(idx)
                # Get the value for each wanted key passed by the user in the constructor args.
                for key, local in self.__config_dict.items():
                    dataframe[key].append(self.__get_value(history, local))

        self.__table = self.__table.append(
            pd.DataFrame(dataframe)
        ) if self.__table is not None else pd.DataFrame(dataframe)
        MSG_INFO(self, 'End of fill step, a pandas DataFrame was created...')
Example #5
    def registry(self, datasetname, path):

        # check task policy
        if datasetname.split('.')[0] != 'user':
            return (StatusCode.FATAL,
                    'The dataset name must start with: user.%USER.taskname.')

        username = datasetname.split('.')[1]

        if username not in [
                user.getUserName() for user in self.__db.getAllUsers()
        ]:
            return (
                StatusCode.FATAL,
                'The username does not exist in the database. Please, report this to the db manager...'
            )

        if self.__db.getDataset(username, datasetname):
            return (StatusCode.FATAL, "The dataset exist into the database")

        # Let's register and upload it into the database
        try:
            # Create the new dataset
            ds = Dataset(id=self.__db.generateId(Dataset),
                         username=username,
                         dataset=datasetname)

            # Abort if the path does not exist
            if not os.path.exists(path):
                return (StatusCode.FATAL,
                        "The path (%s) does not exist." % path)

            # Loop over files
            desired_id = self.__db.generateId(File) + 1
            for idx, subpath in enumerate(expandFolders(path)):
                MSG_INFO(self, "Registry %s into %s", subpath, datasetname)
                file = File(path=subpath, id=desired_id + idx)
                ds.addFile(file)

            self.__db.session().add(ds)
            self.__db.commit()
        except Exception as e:
            MSG_ERROR(self, e)
            return (StatusCode.FATAL,
                    "Impossible to register the dataset (%s)." % datasetname)

        return (StatusCode.SUCCESS, "Successfully uploaded.")
Example #6
    def __init__(self, fList, reader, nFilesPerJob, nthreads):

        Logger.__init__(self)
        from Gaugi import csvStr2List
        from Gaugi import expandFolders
        fList = csvStr2List(fList)
        self._fList = expandFolders(fList)

        def chunks(l, n):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i:i + n]

        self._fList = [l for l in chunks(self._fList, nFilesPerJob)]
        self.process_pipe = []
        self._outputs = []
        self._nthreads = nthreads
        self._reader = reader
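
The chunks helper defined above is plain Python with no Gaugi dependency; a standalone sketch of how it splits the expanded file list into per-job groups (the file names are invented):

# Standalone sketch of the chunking pattern used by the constructor above.
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

files = ['f0.root', 'f1.root', 'f2.root', 'f3.root', 'f4.root']
print(list(chunks(files, 2)))
# [['f0.root', 'f1.root'], ['f2.root', 'f3.root'], ['f4.root']]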
Example #7
 def load(self, basepath, model_idx):
     '''
     This method will open all the histories that were grouped in the initialize method
     and put them into a dictionary in order to make them easier to manipulate. Usually used
     for histories dumped from the best inits table.
     '''
     paths = expandFolders(basepath)
     MSG_INFO(self, "Reading %d files...", len(paths))
     h_dict = dict()
     for path in paths:
         with open(path) as f:
             obj = dict(eval(json.load(f)))
             key = 'et%d_eta%d_sort_%d' % (obj['loc']['et_bin'],
                                           obj['loc']['eta_bin'],
                                           obj['loc']['sort'])
             if obj['loc']['model_idx'] != model_idx:
                 continue
             h_dict[key] = obj
     return h_dict
Example #8
import re
from Gaugi import expandFolders

paths = expandFolders('JF17/')
pat = re.compile(r'.+(?P<binID>et(?P<etBinIdx>\d+).eta(?P<etaBinIdx>\d+))\..+$')
jobIDs = sorted(list(set([pat.match(f).group('binID')  for f in paths if pat.match(f) is not None]))) 


print(jobIDs)
Example #9
    def repro(self,
              volume,
              new_taskname,
              dataFile,
              old_taskname,
              secondaryDS,
              execCommand,
              queue='gpu',
              dry_run=False):

        # check task policy (user.username)
        if new_taskname.split('.')[0] != 'user':
            return (StatusCode.FATAL,
                    'The task name must start with user.$USER.taskname.')

        # check task policy (username must exist into the database)
        username = new_taskname.split('.')[1]
        if username not in [
                user.getUserName() for user in self.__db.getAllUsers()
        ]:
            return (StatusCode.FATAL,
                    'The username does not exist in the database.')

        if self.__db.getUser(username).getTask(new_taskname) is not None:
            return (StatusCode.FATAL,
                    "The task already exists in the database. Abort.")

        #
        # Check if all datasets are registered into the database
        #

        if self.__db.getDataset(username, dataFile) is None:
            return (
                StatusCode.FATAL,
                "The file (%s) does not exist into the database. Should be registry first."
                % dataFile)

        if self.__db.getUser(username).getTask(old_taskname) is None:
            return (StatusCode.FATAL,
                    "The task file (%s) does not exist into the database." %
                    old_taskname)

        secondaryDS = eval(secondaryDS)
        for key in secondaryDS.keys():
            if self.__db.getDataset(username, secondaryDS[key]) is None:
                return (
                    StatusCode.FATAL,
                    "The secondary data file (%s) does not exist into the database. Should be registry first."
                    % secondaryDS[key])

        #
        # check exec command policy
        #
        if '%DATA' not in execCommand:
            return (
                StatusCode.FATAL,
                "The exec command must include '%DATA' in the string. It will be substituted by the dataFile when the job starts."
            )

        if '%IN' not in execCommand:
            return (
                StatusCode.FATAL,
                "The exec command must include '%IN' in the string. It will be substituted by the configFile when the job starts."
            )

        if '%OUT' not in execCommand:
            return (
                StatusCode.FATAL,
                "The exec command must include '%OUT' in the string. It will be substituted by the outputFile when the job starts."
            )

        for key in secondaryDS.keys():
            if key not in execCommand:
                return (StatusCode.FATAL, (
                    "The exec command must include %s in the string. It will be substituted by %s when the job starts."
                ) % (key, secondaryDS[key]))

        #
        # Create the output file
        #
        outputFile = volume + '/' + new_taskname

        if os.path.exists(outputFile):
            MSG_WARNING(self, "The task dir exist into the storage. Beware!")
        else:
            # create the task dir
            MSG_INFO(self, "Creating the task dir in %s", outputFile)
            os.system('mkdir -p %s ' % (outputFile))

        #
        # create the task into the database
        #
        if not dry_run:
            try:
                user = self.__db.getUser(username)

                task = self.__db.createTask(user,
                                            new_taskname,
                                            old_taskname,
                                            dataFile,
                                            outputFile,
                                            "",
                                            secondaryDataPath=secondaryDS,
                                            templateExecArgs=execCommand,
                                            queueName=queue)

                task.setSignal(Signal.WAITING)
                task.setStatus(Status.HOLD)

                tunedFiles = expandFolders(
                    self.__db.getUser(username).getTask(
                        old_taskname).getTheOutputStoragePath())

                _dataFile = self.__db.getDataset(
                    username, dataFile).getAllFiles()[0].getPath()

                _secondaryDS = {}

                for key in secondaryDS.keys():
                    _secondaryDS[key] = self.__db.getDataset(
                        username, secondaryDS[key]).getAllFiles()[0].getPath()

                for idx, _tunedFile in enumerate(tunedFiles):

                    _outputFile = outputFile + '/job_configId_%d' % idx

                    command = execCommand
                    command = command.replace('%DATA', _dataFile)
                    command = command.replace('%IN', _tunedFile)
                    command = command.replace('%OUT', _outputFile)

                    for key in _secondaryDS:
                        command = command.replace(key, _secondaryDS[key])

                    job = self.__db.createJob(task,
                                              _tunedFile,
                                              idx,
                                              execArgs=command,
                                              priority=-1)

                task.setStatus('registered')
                self.__db.commit()
            except Exception as e:
                MSG_ERROR(self, e)
                return (StatusCode.FATAL, "Unknown error.")

        return (StatusCode.SUCCESS, "Succefully created.")
Example #10
try:
    os.mkdir(dirpath)
except:
    mainLogger.warning("The output directory %s exist into the local path",
                       args.outputDir)

if not args.legends:
    args.legends = ['(Ref)', '(Test)']

### Get all files if needed!
files_ref = []
files_test = []

if args.reference:
    for paths in args.reference:
        files_ref.extend(expandFolders(paths))

for paths in args.test:
    files_test.extend(expandFolders(paths))

if args.debug:
    if len(files_ref) > 10: files_ref = files_ref[0:10]
    if len(files_test) > 10: files_test = files_test[0:10]

from pprint import pprint

### Get all trigger for each group
triggerList = []
for group in triggerList_group:
    if type(group) is tuple:
        for t in group:
Example #11
    def __call__(self, sgnFileList, bkgFileList, ofile, dump_csv=False):

        # get all keys
        paths = expandFolders(sgnFileList)
        jobIDs = sorted(
            list(
                set([
                    self._pat.match(f).group('binID') for f in paths
                    if self._pat.match(f) is not None
                ])))
        npatterns = {}
        etBins = None
        etaBins = None

        debug = False

        for id in jobIDs:

            sgnSubFileList = []
            for f in expandFolders(sgnFileList):
                if id in f: sgnSubFileList.append(f)

            if debug:
                sgnSubFileList = sgnSubFileList[0:11]

            reader = ReaderPool(sgnSubFileList,
                                DataReader(self._skip_these_keys),
                                self._nFilesPerJob, self._nthreads)
            MSG_INFO(self, "Reading signal files...")
            outputs = reader()
            sgnDict = outputs.pop()
            if len(outputs) > 0:
                for from_dict in progressbar(outputs,
                                             len(outputs),
                                             'Merging signal files: ',
                                             60,
                                             logger=self._logger):
                    DataReader.merge(from_dict, sgnDict, self._skip_these_keys)

            bkgSubFileList = []
            for f in expandFolders(bkgFileList):
                if id in f: bkgSubFileList.append(f)

            if debug:
                bkgSubFileList = bkgSubFileList[0:11]

            reader = ReaderPool(bkgSubFileList,
                                DataReader(self._skip_these_keys),
                                self._nFilesPerJob, self._nthreads)
            MSG_INFO(self, "Reading background files...")
            outputs = reader()
            bkgDict = outputs.pop()
            if len(outputs) > 0:
                for from_dict in progressbar(outputs,
                                             len(outputs),
                                             'Merging background files: ',
                                             60,
                                             logger=self._logger):
                    DataReader.merge(from_dict, bkgDict, self._skip_these_keys)

            # Loop over regions
            d = {
                "features": sgnDict["features"],
                "etBins": sgnDict["etBins"],
                "etaBins": sgnDict["etaBins"],
                "etBinIdx": sgnDict["etBinIdx"],
                "etaBinIdx": sgnDict["etaBinIdx"],
            }

            #if not etBins:  etBins = sgnDict["etBins"]
            etBins = sgnDict["etBins"]
            #if not etaBins:  etaBins = sgnDict["etaBins"]
            etaBins = sgnDict["etaBins"]

            d['data'] = np.concatenate(
                (sgnDict['pattern_' + id],
                 bkgDict['pattern_' + id])).astype('float32')
            d['target'] = np.concatenate(
                (np.ones((sgnDict['pattern_' + id].shape[0], )),
                 np.zeros(
                     (bkgDict['pattern_' + id].shape[0], )))).astype('int16')

            if sgnDict['pattern_' + id] is not None:
                MSG_INFO(self, 'sgnData_%s : (%d, %d)', id,
                         sgnDict['pattern_' + id].shape[0],
                         sgnDict['pattern_' + id].shape[1])
            else:
                MSG_INFO(self, 'sgnData_%s : empty', id)
            if bkgDict['pattern_' + id] is not None:
                MSG_INFO(self, 'bkgData_%s : (%d, %d)', id,
                         bkgDict['pattern_' + id].shape[0],
                         bkgDict['pattern_' + id].shape[1])
            else:
                MSG_INFO(self, 'bkgData_%s : empty', id)
            MSG_INFO(self, "Saving: %s", ofile + '_' + id)

            npatterns['sgnPattern_' + id] = int(sgnDict['pattern_' +
                                                        id].shape[0])
            npatterns['bkgPattern_' + id] = int(bkgDict['pattern_' +
                                                        id].shape[0])
            save(d, ofile + '_' + id, protocol='savez_compressed')

            if dump_csv:
                # Save as csv for pandas
                dd = {}
                for ikey, key in enumerate(d['features']):
                    dd[key] = d['data'][:, ikey]
                dd['target'] = d['target']
                df = pd.DataFrame(dd)
                df.to_csv(ofile + '_' + id + '.csv')

        self.plotNSamples(npatterns, etBins, etaBins)
Example #12
args = parser.parse_args()

######################################################################################################

# definitions
level_names   = ['L1Calo','L2Calo','L2','EFCalo','HLT']
plot_names    = ['et','eta','mu']
xlabel_names  = ['Offline isolated electron E_{T} [GeV]','#eta','<#mu>']
triggerList = eval(args.triggers)

### Get all files if needed!
files=[]; is_emulated_trigger = []; legends = []
for idx, basepath in enumerate(args.dirs):
  mainLogger.info( basepath )
  f = expandFolders( basepath )
  if len(f)>10 and args.debug: f=f[0:10]
  files.append(f)
  is_emulated_trigger.append(False)
  legends.append(args.legends[idx] if args.legends else str())




if args.emulation_list:
  for idx in args.emulation_list:
    is_emulated_trigger[idx]=True


localpath=os.getcwd()
dirpath=args.outputDir
Example #13
def GetHistogramFromMany(basepath,
                         paths,
                         keys,
                         prefix='Loading...',
                         logger=None):

    from Gaugi import progressbar, expandFolders
    from copy import deepcopy

    # internal open function
    def Open(path):
        from ROOT import TFile
        f = TFile(path, 'read')
        if len(f.GetListOfKeys()) > 0:
            run_numbers = [key.GetName() for key in f.GetListOfKeys()]
            return f, run_numbers
        else:
            return f, None

    # internal close function
    def Close(f):
        f.Close()
        del f

    # internal function to retrieve a histogram
    def GetHistogram(f, run_number, path, logger=None):
        try:
            hist = f.Get(run_number + '/' + path)
            hist.GetEntries()
            return hist

        except:
            return None

    # internal integration
    def SumHists(histList):
        totalHist = None
        for hist in histList:
            if hist is None:
                continue
            if totalHist is None:
                totalHist = deepcopy(hist.Clone())
            else:
                totalHist.Add(hist)
        return totalHist

    files = expandFolders(basepath)
    hists = {}
    for f in progressbar(files, len(files), prefix=prefix, logger=logger):
        try:
            _f, _run_numbers = Open(f)
        except:
            continue
        if _run_numbers is None:
            continue
        for idx, _path in enumerate(paths):
            for _run_number in _run_numbers:
                hist = GetHistogram(_f, _run_number, _path)
                if (hist is not None):
                    if keys[idx] not in hists:
                        hists[keys[idx]] = [deepcopy(hist.Clone())]
                    else:
                        hists[keys[idx]].append(deepcopy(hist.Clone()))
        Close(_f)

    for key in hists.keys():
        hists[key] = SumHists(hists[key])
    #from pprint import pprint
    #pprint(hists)
    return hists
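
A hedged usage sketch for GetHistogramFromMany; the base path, histogram path and key below are hypothetical and only illustrate the call shape (a directory of ROOT files, each containing one top-level directory per run number):

# Hypothetical call; 'monitoring/', the histogram path and the key are illustrative only.
hists = GetHistogramFromMany('monitoring/',
                             paths=['HLT/Egamma/Expert/probes/et'],
                             keys=['probes_et'],
                             prefix='Loading monitoring files...')
# hists['probes_et'] then holds the histograms summed over all files and run numbers.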