import os
import sys
from operator import itemgetter

# NOTE: grid-control internals used below (DataProvider, getConfig, utils,
# DatasetError, LimitedResyncParameterSource, ParameterSource, sort_inplace)
# are assumed to be imported from the enclosing package.


def discover_blocks(options):
	# Get work directory, create dbs dump directory
	if os.path.isdir(options.args[0]):
		workDir = os.path.abspath(os.path.normpath(options.args[0]))
	else:
		workDir = getConfig(configFile=options.args[0]).getWorkPath()
	if not options.opts.tempdir:
		options.opts.tempdir = os.path.join(workDir, 'dbs')
	if not os.path.exists(options.opts.tempdir):
		os.mkdir(options.opts.tempdir)
	# Get provider with dataset information
	if options.opts.input_file:
		provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
	else:
		config = getConfig(configDict={'dataset': options.config_dict})
		provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)
	blocks = provider.getBlocks(show_stats=False)
	DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
	if options.opts.discovery:
		sys.exit(os.EX_OK)
	return blocks
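# A minimal usage sketch (not part of the original module): builds a stand-in
# for the option wrapper that discover_blocks() expects. SimpleNamespace and
# every attribute value below are assumptions that merely mirror what the
# function reads (args[0], opts.tempdir, opts.input_file, opts.discovery,
# config_dict).
def _example_discover_blocks():
	from types import SimpleNamespace
	options = SimpleNamespace(
		args=['work.conf'],  # config file path or an existing work directory
		opts=SimpleNamespace(tempdir='', input_file=None, discovery=False),
		config_dict={})
	blocks = discover_blocks(options)
	print('%d block(s) discovered' % len(blocks))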
def save_dataset(opts, provider):
	print('')
	blocks = provider.getBlocks()
	if opts.ordered:
		# Sort blocks by dataset and block name, then files within each block by URL
		sort_inplace(blocks, key=itemgetter(DataProvider.Dataset, DataProvider.BlockName))
		for block in blocks:
			sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
	DataProvider.saveToFile(opts.save, blocks)
	print('Dataset information saved to ./%s' % opts.save)
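# Hedged usage sketch (assumption, not original code): save_dataset() only
# needs an object carrying 'ordered' and 'save' attributes plus a DataProvider
# instance, e.g. the provider returned by the discovery step above.
def _example_save_dataset(provider):
	from types import SimpleNamespace
	opts = SimpleNamespace(ordered=True, save='dataset.dat')
	save_dataset(opts, provider)  # writes the sorted block listing to ./dataset.dat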
def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, repository, keepOld=True):
	LimitedResyncParameterSource.__init__(self)
	(self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
		(dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
	repository['dataset:%s' % srcName] = self
	self.resyncSetup(interval=-1)
	if not dataProvider:  # debug mode - used by scripts - disables resync
		self._maxN = self._data_splitter.getMaxJobs()
		return
	# look for aborted resyncs - and try to restore old state if possible
	if self._existsDataPath('cache.dat.resync') and self._existsDataPath('map.tar.resync'):
		utils.renameFile(self._getDataPath('cache.dat.resync'), self._getDataPath('cache.dat'))
		utils.renameFile(self._getDataPath('map.tar.resync'), self._getDataPath('map.tar'))
	elif self._existsDataPath('cache.dat.resync') or self._existsDataPath('map.tar.resync'):
		raise DatasetError('Found broken resync state')
	if self._existsDataPath('cache.dat') and self._existsDataPath('map.tar'):
		self._data_splitter.importPartitions(self._getDataPath('map.tar'))
	else:
		DataProvider.saveToFile(self._getDataPath('cache.dat'),
			self._data_provider.getBlocks(show_stats=False))
		self._data_splitter.splitDataset(self._getDataPath('map.tar'),
			self._data_provider.getBlocks(show_stats=False))
	self._maxN = self._data_splitter.getMaxJobs()
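# The constructor above recovers from an aborted resync by requiring the
# '.resync' staging files to appear as a complete pair before they are renamed
# into place. A standalone sketch of that commit protocol (the function name
# and the plain os.rename() call are illustrative assumptions, not
# grid-control API):
def _commit_staged_files(staging_pairs):
	# staging_pairs: list of (staged_path, final_path) tuples; either every
	# staged file exists (commit them all) or none does (nothing to do) -
	# a partial set means the resync was interrupted mid-write.
	staged = [os.path.exists(src) for (src, _dst) in staging_pairs]
	if all(staged):
		for (src, dst) in staging_pairs:
			os.rename(src, dst)  # atomic on POSIX filesystems
	elif any(staged):
		raise RuntimeError('Found broken resync state')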
def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld=True):
	ParameterSource.__init__(self)
	(self._dataDir, self._srcName, self._dataProvider, self._dataSplitter, self._part_proc) = \
		(dataDir, srcName, dataProvider, dataSplitter, dataProc)
	if not dataProvider:
		pass  # debug mode - used by scripts - disables resync
	elif os.path.exists(self.getDataPath('cache.dat')) and os.path.exists(self.getDataPath('map.tar')):
		# reuse the cached partition map only if both files are present
		self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
	else:
		DataProvider.saveToFile(self.getDataPath('cache.dat'), self._dataProvider.getBlocks(silent=False))
		self._dataSplitter.splitDataset(self.getDataPath('map.tar'), self._dataProvider.getBlocks())
	self._maxN = self._dataSplitter.getMaxJobs()
	self._keepOld = keepOld
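# Note on the condition above: it must be written as two separate
# os.path.exists() calls. os.path.exists(a and b) tests only one path, because
# 'and' returns its second operand when the first is truthy:
#   >>> 'cache.dat' and 'map.tar'
#   'map.tar'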