def process(self, block_iter):
	"""Yield blocks, optionally trimming each dataset's file list to a fraction.

	Resets the per-dataset counters, then either passes blocks straight
	through (no fractional limit configured) or materializes the base
	processor's output so the counters are filled before computing a
	per-dataset file goal and reducing each block's file list to it.
	"""
	self._limit_files_per_ds = {}
	self._files_per_ds = {}
	if self._limit_files_fraction < 0:
		# No fractional limit configured - forward blocks unchanged
		for block in DataProcessor.process(self, block_iter):
			yield block
		return
	# Materialize the base iterator first: iterating it fills
	# self._files_per_ds as a side effect, which the goals below need
	processed_blocks = list(DataProcessor.process(self, block_iter))
	# Per-dataset file goal - 'or 1' keeps at least one file per dataset
	goal_per_ds = dict((ds_name, int(self._limit_files_fraction * file_count) or 1)
		for (ds_name, file_count) in self._files_per_ds.items())
	for block in processed_blocks:
		self._reduce_fn_list(block, goal_per_ds)
		yield block
def process(self, block_iter):
	"""Process blocks once and derive split options from the collected counters.

	When disabled or already run (config consumed), the input iterator is
	returned untouched. Otherwise the base processor's output is
	materialized, 'files per job' / 'events per job' split options are
	written when a job target is set, and the config reference is dropped
	so this runs only once.
	"""
	if not (self.enabled() and self._config):
		return block_iter
	result = list(DataProcessor.process(self, block_iter))
	if (self._target_jobs > 0) or (self._target_jobs_ds > 0):
		for (opt_name, counter) in [('files per job', self._files), ('events per job', self._entries)]:
			self._set_split_opt(self._config, opt_name, dict(counter),
				self._target_jobs, self._target_jobs_ds)
	self._config = None  # one-shot: drop the config reference after use
	return result
def process(self, blockIter):
	"""Clear the duplicate-tracking sets, then delegate to the base processor.

	NOTE(review): attribute/parameter names are camelCase unlike the rest
	of the file; kept as-is since sibling methods presumably read them.
	"""
	self._recordedURL, self._recordedBlock = (set(), set())
	return DataProcessor.process(self, blockIter)
def process(self, block_iter):
	"""Reset the URL and block record sets before handing off processing.

	Both bookkeeping sets are re-created empty for this pass, then block
	processing is delegated to the DataProcessor base implementation.
	"""
	self._recorded_block = set()  # fresh record of seen blocks
	self._recorded_url = set()  # fresh record of seen URLs
	return DataProcessor.process(self, block_iter)