def getEntries(self, path, metadata, events, seList, objStore):
	jobNum = metadata['GC_JOBNUM']
	tar = tarfile.open(os.path.join(path, 'cmssw.dbs.tar.gz'), 'r')
	# Collect infos about transferred files
	fileSummaryMap = {}
	try:
		for rawdata in imap(str.split, tar.extractfile('files').readlines()):
			fileSummaryMap[rawdata[2]] = {
				'SE_OUTPUT_HASH_CRC32': rawdata[0],
				'SE_OUTPUT_SIZE': int(rawdata[1])
			}
		objStore['CMSSW_FILES'] = fileSummaryMap
	except Exception:
		raise DatasetError('Could not read CMSSW file infos for job %d!' % jobNum)
	# Collect infos about CMSSW processing steps
	cfgSummaryMap = {}
	self._processSteps(jobNum, tar, cfgSummaryMap, fileSummaryMap)
	for cfg in cfgSummaryMap:
		metadata.setdefault('CMSSW_CONFIG_JOBHASH', []).append(
			cfgSummaryMap[cfg]['CMSSW_CONFIG_HASH'])
	objStore.update({'CMSSW_CONFIG': cfgSummaryMap, 'CMSSW_FILES': fileSummaryMap})
	tar.close()
	yield (path, metadata, events, seList, objStore)

def create_tarball(match_info_iter, **kwargs):
	tar = tarfile.open(mode='w:gz', **kwargs)
	activity = Activity('Generating tarball')
	for match_info in match_info_iter:
		if isinstance(match_info, tuple):
			(path_source, path_target) = match_info
		else:
			(path_source, path_target) = (match_info, None)
		if isinstance(path_source, str):
			if not os.path.exists(path_source):
				raise PathError('File %s does not exist!' % path_source)
			tar.add(path_source, path_target or os.path.basename(path_source), recursive=False)
		elif path_source is None:  # Update activity
			activity.update('Generating tarball: %s' % path_target)
		else:  # File handle
			(info, handle) = path_source.get_tar_info()
			if path_target:
				info.name = path_target
			info.mtime = time.time()
			info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
			if info.name.endswith('.sh') or info.name.endswith('.py'):
				info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
			tar.addfile(info, handle)
			handle.close()
	activity.finish()
	tar.close()

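# Usage sketch (not from the original source; the file names are hypothetical):
# create_tarball accepts an iterable mixing plain paths, (source, target)
# tuples, and objects exposing get_tar_info(); kwargs go to tarfile.open.
def _example_create_tarball():
	create_tarball([
		'jobinfo.txt',                      # stored under its basename
		('config/task.conf', 'task.conf'),  # stored under an explicit target name
	], name='sandbox.tar.gz')
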
def genTarball(outFile, fileList):
	tar = tarfile.open(outFile, 'w:gz')
	activity = None
	for (pathAbs, pathRel, pathStatus) in fileList:
		if pathStatus is True:  # Existing file
			tar.add(pathAbs, pathRel, recursive=False)
		elif pathStatus is False:  # File that still needs an existence check
			if not os.path.exists(pathAbs):
				raise UserError('File %s does not exist!' % pathRel)
			tar.add(pathAbs, pathRel, recursive=False)
		elif pathStatus is None:  # Directory - update progress display
			del activity
			msg = QM(len(pathRel) > 50, pathRel[:15] + '...' + pathRel[-32:], pathRel)
			activity = ActivityLog('Generating tarball: %s' % msg)
		else:  # File handle
			(info, handle) = pathStatus.getTarInfo()
			info.mtime = time.time()
			info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
			if info.name.endswith('.sh') or info.name.endswith('.py'):
				info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
			tar.addfile(info, handle)
			handle.close()
	del activity
	tar.close()

def _iter_datasource_items(self, item, metadata_dict, entries, location_list, obj_dict):
	jobnum = metadata_dict['GC_JOBNUM']
	cms_log_fn = os.path.join(item, 'cmssw.dbs.tar.gz')
	if os.path.exists(cms_log_fn):
		tar = tarfile.open(cms_log_fn, 'r')
		# Collect infos about transferred files
		file_summary_map = {}
		try:
			file_info_str_list = tar.extractfile('files').readlines()
			for rawdata in imap(lambda value: bytes2str(value).split(), file_info_str_list):
				file_summary_map[rawdata[2]] = {
					'SE_OUTPUT_HASH_CRC32': rawdata[0],
					'SE_OUTPUT_SIZE': int(rawdata[1])
				}
			obj_dict['CMSSW_FILES'] = file_summary_map
		except Exception:
			raise DatasetError('Could not read CMSSW file infos for job %d!' % jobnum)
		# Collect infos about CMSSW processing steps
		config_summary_map = {}
		self._process_steps(jobnum, tar, config_summary_map, file_summary_map)
		for cfg in config_summary_map:
			job_hash_list = metadata_dict.setdefault('CMSSW_CONFIG_JOBHASH', [])
			job_hash_list.append(config_summary_map[cfg]['CMSSW_CONFIG_HASH'])
		obj_dict.update({'CMSSW_CONFIG': config_summary_map, 'CMSSW_FILES': file_summary_map})
		tar.close()
	yield (item, metadata_dict, entries, location_list, obj_dict)

def get_cmssw_info(tar_fn):
	import xml.dom.minidom
	# Read framework report files to get number of events
	cmssw_tar = tarfile.open(tar_fn, 'r:gz')
	fwk_report_list = ifilter(lambda x: os.path.basename(x.name) == 'report.xml',
		cmssw_tar.getmembers())
	for fwk_report_fn in imap(cmssw_tar.extractfile, fwk_report_list):
		yield xml.dom.minidom.parse(fwk_report_fn)

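# Usage sketch (hypothetical archive name): get_cmssw_info yields one parsed
# xml.dom.minidom document per report.xml found in the job tarball.
def _example_get_cmssw_info(tar_fn='cmssw.tar.gz'):
	for dom in get_cmssw_info(tar_fn):
		print(dom.documentElement.tagName)
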
def loadSplitting(self, path):
	try:
		version = int(tarfile.open(path, 'r:').extractfile('Version').read())
	except Exception:
		version = 1
	if version == 1:
		state = DataSplitterIO_V1().loadSplitting(path)
	else:
		state = DataSplitterIO_V2().loadSplitting(path)
	return state

def getCMSSWInfo(tarPath):
	import xml.dom.minidom
	# Read framework report files to get number of events
	tarFile = tarfile.open(tarPath, 'r:gz')
	fwkReports = ifilter(lambda x: os.path.basename(x.name) == 'report.xml',
		tarFile.getmembers())
	for fwkReport in imap(tarFile.extractfile, fwkReports):
		try:
			yield xml.dom.minidom.parse(fwkReport)
		except Exception:
			logging.exception('Error while parsing %s', tarPath)
			raise

def _getPartition(self, key):
	# Use integer division ('//' also works under Python 3) - partitions are
	# bundled into nested tarballs in groups of 100
	if not self._cacheKey == key // 100:
		self._cacheKey = key // 100
		subTarFileObj = self._tar.extractfile('%03dXX.tgz' % (key // 100))
		# Unpack the whole gzip stream at once: 3-4x speedup for sequential access
		subTarFileObj = BytesBuffer(gzip.GzipFile(fileobj = subTarFileObj).read())
		self._cacheTar = tarfile.open(mode = 'r', fileobj = subTarFileObj)
	data = self._fmt.parse(self._cacheTar.extractfile('%05d/info' % key).readlines(),
		keyParser = {None: int}, valueParser = self._parserMap)
	fileList = lmap(bytes2str, self._cacheTar.extractfile('%05d/list' % key).readlines())
	if DataSplitter.CommonPrefix in data:
		fileList = imap(lambda x: '%s/%s' % (data[DataSplitter.CommonPrefix], x), fileList)
	data[DataSplitter.FileList] = lmap(str.strip, fileList)
	return data

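# Layout sketch (derived from _getPartition): partitions are bundled in groups
# of 100, so partition 1234 lives in the nested archive '012XX.tgz' under the
# member names '01234/info' and '01234/list'.
def _example_partition_paths(key=1234):
	nested_fn = '%03dXX.tgz' % (key // 100)  # -> '012XX.tgz'
	return (nested_fn, '%05d/info' % key, '%05d/list' % key)
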
def __init__(self, path):
	activity = Activity('Reading dataset partition file')
	self._lock = GCLock()
	self._fmt = utils.DictFormat()
	self._tar = tarfile.open(path, 'r:')
	(self._cacheKey, self._cacheTar) = (None, None)

	metadata = self._fmt.parse(self._tar.extractfile('Metadata').readlines(), keyParser = {None: str})
	self.maxJobs = metadata.pop('MaxJobs')
	self.classname = metadata.pop('ClassName')
	self.metadata = {'dataset': dict(ifilter(lambda k_v: not k_v[0].startswith('['), metadata.items()))}
	for (k, v) in ifilter(lambda k_v: k_v[0].startswith('['), metadata.items()):
		self.metadata.setdefault('dataset %s' % k.split(']')[0].lstrip('['), {})[k.split(']')[1].strip()] = v
	activity.finish()

	self._parserMap = {
		None: str,
		DataSplitter.NEntries: int,
		DataSplitter.Skipped: int,
		DataSplitter.DatasetID: int,
		DataSplitter.Invalid: parseBool,
		DataSplitter.Locations: lambda x: parseList(x, ','),
		DataSplitter.MetadataHeader: parseJSON,
		DataSplitter.Metadata: lambda x: parseJSON(x.strip("'"))
	}

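# Illustrative 'Metadata' member content (hypothetical values) as parsed above:
# MaxJobs and ClassName are popped, remaining plain keys end up in
# self.metadata['dataset'], and '[section] key' entries are grouped under
# 'dataset <section>'.
_EXAMPLE_METADATA = b'''MaxJobs = 123
ClassName = EventBoundarySplitter
nickname = mc_sample
[block1] nickname = mc_sample_block1
'''
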
def genTarball(outFile, fileList):
	tar = tarfile.open(outFile, 'w:gz')
	activity = Activity('Generating tarball')
	for (pathAbs, pathRel, pathStatus) in fileList:
		if pathStatus is True:  # Existing file
			tar.add(pathAbs, pathRel, recursive = False)
		elif pathStatus is False:  # File that still needs an existence check
			if not os.path.exists(pathAbs):
				raise UserError('File %s does not exist!' % pathRel)
			tar.add(pathAbs, pathRel, recursive = False)
		elif pathStatus is None:  # Directory - update progress display
			activity.update('Generating tarball: %s' % pathRel)
		else:  # File handle
			(info, handle) = pathStatus.getTarInfo()
			info.mtime = time.time()
			info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
			if info.name.endswith('.sh') or info.name.endswith('.py'):
				info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
			tar.addfile(info, handle)
			handle.close()
	activity.finish()
	tar.close()

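# Usage sketch (hypothetical paths): each fileList entry is
# (pathAbs, pathRel, pathStatus), where pathStatus True = known existing file,
# False = verify existence first, None = directory marker for the progress
# display, and any other value is treated as a file handle object.
def _example_genTarball():
	genTarball('sandbox.tar.gz', [
		('/work/task/run.sh', 'run.sh', True),
		('/work/task/extra.py', 'extra.py', False),
		('/work/task/cfg', 'cfg', None),
	])
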
def __init__(self, path):
	activity = Activity('Reading dataset partition file')
	self._fmt = DictFormat()
	try:
		self._tar = tarfile.open(path, 'r:')
		metadata = self._fmt.parse(self._tar.extractfile('Metadata').readlines(), key_parser={None: str})
		FilePartitionReader.__init__(self, path, metadata.pop('MaxJobs'))
		self._metadata = metadata
		activity.finish()
	except Exception:
		raise PartitionReaderError('No valid dataset splitting found in %s' % path)
	self._map_enum2parser = {
		None: str,
		DataSplitter.NEntries: int,
		DataSplitter.Skipped: int,
		DataSplitter.Invalid: parse_bool,
		DataSplitter.Locations: lambda x: parse_list(x, ','),
		DataSplitter.MetadataHeader: parse_json,
		DataSplitter.Metadata: lambda x: parse_json(x.strip("'"))
	}
	(self._cache_nested_fn, self._cache_nested_tar) = (None, None)

def _open_nested_tar(self, nested_fn):
	nested_tar_fp = self._tar.extractfile(nested_fn)
	# Unpack the whole gzip stream at once - speeds up sequential access
	nested_tar_fp = BytesBuffer(gzip.GzipFile(fileobj=nested_tar_fp).read())
	return tarfile.open(mode='r', fileobj=nested_tar_fp)

def save_partitions(self, path, partition_iter, progress=None):
	outer_tar = tarfile.open(path, 'w:')
	self._save_partitions(outer_tar, partition_iter, progress)
	outer_tar.close()

def _createSubTar(self, subTarFileName):
	subTarFileObj = BytesBuffer()
	subTarFile = tarfile.open(mode = 'w:gz', fileobj = subTarFileObj)
	return (subTarFile, subTarFileObj, subTarFileName)

def _create_nested_tar(self, fn):
	nested_tar_fp = BytesBuffer()
	nested_tar = tarfile.open(mode='w:gz', fileobj=nested_tar_fp)
	nested_tar.nested_tar_fp = nested_tar_fp
	nested_tar.nested_fn = fn
	return nested_tar

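# Round-trip sketch using only the standard library (io.BytesIO stands in for
# the project's BytesBuffer): build a gzip-compressed tar in memory the same
# way _create_nested_tar does, then read a member back like _open_nested_tar.
def _example_nested_tar_roundtrip():
	import io
	buf = io.BytesIO()
	nested_tar = tarfile.open(mode='w:gz', fileobj=buf)
	payload = 'NEntries = 42\n'.encode()
	info = tarfile.TarInfo('00042/info')
	(info.size, info.mtime) = (len(payload), time.time())
	nested_tar.addfile(info, io.BytesIO(payload))
	nested_tar.close()
	buf.seek(0)
	return tarfile.open(mode='r:gz', fileobj=buf).extractfile('00042/info').read()
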
def saveSplitting(self, path, meta, source, sourceLen, message = 'Writing job mapping file'):
	tar = tarfile.open(path, 'w:')
	self._saveStateToTar(tar, meta, source, sourceLen, message)
	tar.close()

def __new__(cls, path):
	version = ignore_exception(Exception, 1,
		lambda: int(tarfile.open(path, 'r:').extractfile('Version').read()))
	return FilePartitionReader.create_instance('version_%s' % version, path)

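# Standalone equivalent of the version probe above (mirrors loadSplitting):
# read the 'Version' member of the outer tar, defaulting to 1 for old archives
# that do not carry one.
def _example_read_version(path):
	try:
		return int(tarfile.open(path, 'r:').extractfile('Version').read())
	except Exception:
		return 1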