Example #1
 def getDataset(self, **kwargs):
     '''Get all metadata of all datasets.
     Public method, not exported to GPI.'''
     
     db_view_column = ['dataset_id', 'creation_date', 'occupancy']
     sql = 'SELECT * FROM dataset_union WHERE true'
     kwargs['owner'] = kwargs.get('owner', ['official', utils.getOwner()])
     
     # add filter to query
     if len(kwargs) > 0:
         for key, value in kwargs.iteritems():
             if key in db_view_column:
                 sql += " AND %s ILIKE '%s%%'" % (key, value)
             elif key == 'files':
                 sql += " AND files > %s" % value
             elif key in ['status', 'session', 'owner']:
                 if not isinstance(value, list):
                     value = [value]
                 
                 sql += " AND (false"
                 for s in value:
                     sql += " OR %s ILIKE '%s%%'" % (key, s)
                 sql += ")"
                 
             else:
                 sql += " AND parameters->'%s' ILIKE '%s%%'" % (key, value)
     
     # clean up the query
     sql = sql.replace('false OR ', '')
     sql = sql.replace('true AND ', '')
     
     # TODO: add input validation to prevent SQL injection
     datasets = db.read(sql)
     
     if len(datasets) == 0:
         raise GangaException('No dataset found')
     
     i = 0
     for dataset in datasets:
         dataset['id'] = i
         i += 1
         dataset['occupancy_human'] = utils.sizeof_fmt_binary(dataset['occupancy'])
         if 'evt_file' in dataset['parameters'] and 'evt_tot' not in dataset['parameters']:
             evt_file = int(dataset['parameters']['evt_file'])
             if dataset['files'] is None:
                 dataset['files'] = 0
             files = int(dataset['files'])
             dataset['parameters']['evt_tot'] = evt_file * files
         if 'evt_tot' in dataset['parameters']:
             dataset['parameters']['evt_tot_human'] = utils.sizeof_fmt_decimal(int(dataset['parameters']['evt_tot']))
     
     return datasets
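
The TODO above points out that the filter values are interpolated straight into the SQL string. Below is a minimal sketch, assuming a psycopg2-style DB-API driver that accepts %s placeholders together with a parameter sequence, of how the same filters could be assembled as a parameterized query; build_dataset_query is a hypothetical helper, not part of the project:

def build_dataset_query(**kwargs):
    '''Return (sql, params) for a filtered query on the dataset_union view.'''
    db_view_column = ['dataset_id', 'creation_date', 'occupancy']
    sql = 'SELECT * FROM dataset_union WHERE true'
    params = []

    for key, value in sorted(kwargs.items()):
        # column names are only taken from whitelists, values always go through params
        if key in db_view_column:
            sql += ' AND ' + key + ' ILIKE %s'
            params.append(value + '%')
        elif key == 'files':
            sql += ' AND files > %s'
            params.append(int(value))
        elif key in ('status', 'session', 'owner'):
            if not isinstance(value, list):
                value = [value]
            sql += ' AND (' + ' OR '.join([key + ' ILIKE %s'] * len(value)) + ')'
            params.extend([v + '%' for v in value])
        else:
            # free-form keys go through the hstore lookup; the key is a parameter too
            sql += ' AND parameters->%s ILIKE %s'
            params.extend([key, value + '%'])

    return sql, params


sql, params = build_dataset_query(session='fastsim', owner=['official'])
print(sql)
print(params)
# the pair would then be passed to something like cursor.execute(sql, params)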
Example #2
 def getDataset(self, **kwargs):
     '''Interactive method. It prints the datasets (the user can apply filters);
     the user chooses one of them and enters the number of events required.'''
     
     manager = SBDatasetManager.SBDatasetManager()
     
     def validateFilter(filter, allowed):
         kwargs[filter] = kwargs.get(filter, allowed)
         if not isinstance(kwargs[filter], list):
             kwargs[filter] = [kwargs[filter]]
         if not set(kwargs[filter]).issubset(set(allowed)):
             raise GangaException('%s must be %s' % (filter, allowed))
     
     validateFilter('status', ['open', 'closed'])
     validateFilter('session', ['fastsim', 'fullsim'])
     
     datasets = manager.getDataset(**kwargs)
     dataset = manager.printDatasets(datasets)
     
     self.dataset_id = dataset['dataset_id']
     
     print('\nChosen dataset details:')
     manager.printDatasetDetail(dataset)
     
     print('\nInsert the minimum number of events that you need for your analysis (zero for all):')
     self.events_total = utils.getIndex(maxInclusive=int(dataset['parameters']['evt_tot']))
     
     lfns = self.__getLFNs(dataset['parameters']['evt_file'])
     
     tot_size = 0
     tot_files = len(lfns)
     tot_events = int(dataset['parameters']['evt_file']) * tot_files
     
     for lfn in lfns:
         tot_size += int(lfn['size'])
     
     print('\nTotal job input size: ' + str(utils.sizeof_fmt_binary(tot_size)))
     print('Total selected number of events: ' + str(utils.sizeof_fmt_decimal(tot_events)))
     print('Total number of involved lfns: ' + str(tot_files))
     
     print('\nInsert the maximum number of events for each subjob. Remember:')
     print('- maximum output size is 2GiB.')
     print('- suggested maximum job duration 18h.')
     print('- maximum job input size is 10GiB.')
     print('- at least %s (that is the number of events of one file).' % dataset['parameters']['evt_file'])
     
     self.events_per_subjobs = utils.getIndex(minInclusive=int(dataset['parameters']['evt_file']), maxInclusive=tot_events)
     job = self.__createInputPath(lfns, dataset['parameters']['evt_file'])
     
     print('\nSubjobs details:')
     column_names = ['id', 'list_path', 'size', 'events', 'lfns']
     print(utils.format_dict_table(job, column_names))
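
The nested validateFilter above normalizes a keyword filter to a list, defaults it to every allowed value, and rejects anything outside the allowed set. A standalone sketch of the same pattern, with hypothetical names and a plain ValueError in place of GangaException:

def validate_filter(kwargs, name, allowed):
    # default to every allowed value, then normalize a scalar to a list
    kwargs[name] = kwargs.get(name, list(allowed))
    if not isinstance(kwargs[name], list):
        kwargs[name] = [kwargs[name]]
    # reject values outside the allowed set
    if not set(kwargs[name]).issubset(set(allowed)):
        raise ValueError('%s must be one of %s' % (name, allowed))
    return kwargs[name]


filters = {'session': 'fastsim'}
print(validate_filter(filters, 'session', ['fastsim', 'fullsim']))  # ['fastsim']
print(validate_filter(filters, 'status', ['open', 'closed']))       # ['open', 'closed']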
Example #3
 def __createInputPath(self, lfns, evt_file):
     '''This method splits the list of LFNs between subjobs and writes a 
     text file for each one.'''
     
     evt_file = int(evt_file)
     
     # split all lfns between subjobs
     job = list()
     job.append(list())
     size = 0
     events = 0
     maxInput = 10 * (2**30) # 10GiB
     minInput = 2 * (2**30) # 2GiB
     
     # fill the subjobs as long as there are LFNs,
     # to determine the number of subjobs required
     for lfn in lfns:
         if (size + int(lfn['size'])) < maxInput and (events + evt_file) <= self.events_per_subjobs:
             size += int(lfn['size'])
             events += evt_file
         else:
             job.append(list())
             size = int(lfn['size'])
             events = evt_file
         
         job[-1].append(lfn)
     
     self.number_of_subjobs = len(job)
     
     # balance the number of LFNs across the subjobs
     tot_files = len(lfns)
     balanced_number_lfn_per_subjob = int(math.ceil(float(tot_files)/self.number_of_subjobs))
     job = list()
     self.input_path = list()
     max_size = 0
     jobInputDir = self.getJobObject().inputdir
     lfns_index = 0
     
     for subjob_id in xrange(self.number_of_subjobs):
         subjob = dict()
         size = 0
         events = 0
         number_lfns = 0
         subjob['id'] = str(subjob_id)
         subjob['list_path'] = os.path.join(jobInputDir, "list_%d.txt" % subjob_id)
         
         f = open(subjob['list_path'], 'w')
         try:
             for lfn in lfns[lfns_index:lfns_index + balanced_number_lfn_per_subjob]:
                 f.write(lfn['lfn'] + '\n')
                 size += int(lfn['size'])
                 events += evt_file
                 number_lfns += 1
         finally:
             f.close()
         
         lfns_index += balanced_number_lfn_per_subjob
         self.input_path.append(File(f.name))
         subjob['size'] = utils.sizeof_fmt_binary(size)
         subjob['events'] = utils.sizeof_fmt_decimal(int(events))
         subjob['lfns'] = number_lfns
         job.append(subjob)
         
         if size > max_size:
             max_size = size
     
     if max_size < minInput:
         logger.warning('The input of these subjobs is very small; to improve '
                        'efficiency you could increase the number of events per subjob.')
     
     return job
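
The first pass above only counts how many subjobs are needed; the second pass then spreads the LFNs evenly across them, taking ceil(total_files / number_of_subjobs) per subjob. A self-contained sketch of that balancing step, with hypothetical names and toy data:

import math

def split_evenly(lfns, number_of_subjobs):
    # ceil(total / subjobs) LFNs per chunk; the last chunk may be shorter
    per_subjob = int(math.ceil(float(len(lfns)) / number_of_subjobs))
    return [lfns[i:i + per_subjob] for i in range(0, len(lfns), per_subjob)]

chunks = split_evenly(['lfn_%02d' % n for n in range(10)], 3)
for i, chunk in enumerate(chunks):
    print('subjob %d: %d lfns %s' % (i, len(chunk), chunk))
# 10 files over 3 subjobs -> chunks of 4, 4 and 2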