def _clean_info(
    root: Optional[str], epoch: int, info: tarfile.TarInfo
) -> tarfile.TarInfo:
    """
    Remove variable data from an archive entry.

    The entry is modified in place: its name is normalised to an explicitly
    relative form, ownership is reset to uid/gid 0 with empty user and group
    names, and the modification time is replaced by ``epoch``.

    :param root: absolute path to the root directory from which the
        entry was added, or None to disable turning the name into a
        relative path
    :param epoch: fixed modification time to set
    :param info: tarinfo object to set
    :returns: changed tarinfo (the same object that was passed in)
    """
    if root is not None:
        info.name = os.path.relpath("/" + info.name, root)

    # A name is already explicitly relative only when its *first component*
    # is "." or "..".  Testing for a leading "." character alone would also
    # match hidden files such as ".gitignore", which would then be stored
    # without the "./" prefix every other entry receives.
    first_component = info.name.replace(os.sep, "/").split("/", 1)[0]
    if first_component not in (".", ".."):
        info.name = "./" + info.name

    info.uid = 0
    info.gid = 0
    info.uname = ""
    info.gname = ""
    info.mtime = epoch
    return info
def stream_regular_file(self, filepath, tarinfo_buf, file_info):
    """Yield the raw tar member for one stored file.

    The generator yields, in order: the pre-built header ``tarinfo_buf``,
    the file content in pieces of at most ``self.CHUNKSIZE`` bytes, and
    finally NUL bytes padding the content up to a multiple of
    ``self.BLOCKSIZE``.

    If the file cannot be located or opened, the failure is logged and a
    header for an entry named ``<archive path>.MISSING`` is yielded instead.
    If reading fails after streaming has started, the failure is logged and
    ``self.abort(500, msg)`` is called.

    :param filepath: path handed to ``files.get_fs_by_file_path`` and to the
        returned filesystem's ``open``
    :param tarinfo_buf: already serialised tar header for this member
    :param file_info: 4-tuple used to fill the log messages; judging by the
        message template it is (file path, container name, container id,
        archive path) -- element 3 is used as the archive path
    """
    try:
        file_system = files.get_fs_by_file_path(filepath)
        with file_system.open(filepath, 'rb') as fd:
            # The file is opened in binary mode, so end-of-file is an empty
            # *bytes* object.  A text sentinel ('') never compares equal to
            # b'' on Python 3 and would make this iterator endless; b'' is
            # the very same object type as '' on Python 2.
            f_iter = iter(lambda: fd.read(self.CHUNKSIZE), b'')  # pylint: disable=cell-var-from-loop
            try:
                yield tarinfo_buf
                # Remember the last chunk so the padding can be derived
                # from its length once the file is exhausted.
                chunk = b''
                for chunk in f_iter:
                    yield chunk
                if len(chunk) % self.BLOCKSIZE != 0:
                    yield (self.BLOCKSIZE - (len(chunk) % self.BLOCKSIZE)) * b'\0'
            except (IOError, fs.errors.OperationFailed):
                msg = (
                    "Error happened during sending file content in archive stream, file path: %s, "
                    "container: %s/%s, archive path: %s" % file_info)
                self.log.critical(msg)
                self.abort(500, msg)
    except (fs.errors.ResourceNotFound, fs.errors.OperationFailed, IOError):
        self.log.critical(
            "Couldn't find the file during creating archive stream: %s, "
            "container: %s/%s, archive path: %s" % file_info)
        # Emit a placeholder header so the missing file is visible in the
        # resulting archive listing.
        tarinfo = TarInfo()
        tarinfo.name = file_info[3] + '.MISSING'
        yield tarinfo.tobuf()
def _test_roundtrip(self, context):
    """Write a single member to an archive, read it back, compare payloads.

    ``context`` supplies the archive path (``given_file()``) and the mode
    string for a given base mode (``mode('w')`` / ``mode('r')``).
    """
    archive_path = context.given_file()
    payload = b'test content'
    member_name = 'archived-file.txt'

    # Write phase: one member holding the payload.
    with xtarfile_open(archive_path, context.mode('w')) as writer:
        header = TarInfo()
        header.size = len(payload)
        header.name = member_name
        writer.addfile(header, BytesIO(payload))

    # Read phase: walk the members until the expected name shows up.
    with xtarfile_open(archive_path, context.mode('r')) as reader:
        entry = reader.next()
        while entry is not None and entry.name != member_name:
            entry = reader.next()
        if entry is None:
            self.fail('{} not found in archive'.format(member_name))
        actual_content = reader.extractfile(entry).read()

    self.assertEqual(actual_content, payload)
def _unpack_info_file(self, tar: TarFile, member: TarInfo, fileobj: io.BytesIO):
    """Add ``member`` to ``tar`` renamed into the dpkg info directory.

    The member is renamed in place to
    ``./var/lib/dpkg/info/<package name>.<original name>``, where the
    original name has any leading ``./`` or ``/`` removed, and is then
    written to ``tar`` together with ``fileobj``.

    :param tar: archive the renamed member is added to
    :param member: entry to rename (mutated) and add
    :param fileobj: content passed to ``tar.addfile`` for the member
    """
    directory = Path("var", "lib", "dpkg", "info").as_posix()

    # Remove leading "./" and "/" *prefixes*.  ``str.lstrip("./")`` would
    # strip any run of "." and "/" characters, which also eats the first
    # dot of a name such as ".hidden".
    name = member.name
    while name.startswith(("./", "/")):
        name = name[2:] if name.startswith("./") else name[1:]

    member.name = f"./{directory}/{self.package.name}.{name}"
    tar.addfile(member, fileobj)
def create_article(self, data, info=None):
    """Store ``data`` as a new article and return its identifier.

    The identifier is the next record number of the article database
    rendered as eight zero-padded hex digits; it is prepended to the name
    of ``info`` (a fresh ``TarInfo`` when none is given) before the entry is
    handed to ``_add_corpus``.

    Raises ``TarCMS.TarCMSError`` when the instance has no open mode.
    """
    if not self._mode:
        raise TarCMS.TarCMSError('not open: %r' % self)

    entry = TarInfo() if info is None else info
    assert isinstance(entry, TarInfo)

    article_id = '%08x' % self._artdb.nextrecno()
    entry.name = article_id + entry.name

    # The corpus is expected to report back the very id that was assigned.
    stored_id = self._add_corpus(entry, data)
    assert article_id == stored_id
    self._artdb.add_record(stored_id)
    return article_id
def create_article(self, data, info=None):
    """Add a new article holding ``data`` and return its id.

    The id is ``'%08x'`` of the article database's next record number.  It
    is prefixed to ``info.name`` (``info`` defaults to an empty ``TarInfo``),
    the entry is stored through ``_add_corpus`` and the id is recorded in
    the article database.

    Raises ``TarCMS.TarCMSError`` if the instance is not open.
    """
    if not self._mode:
        raise TarCMS.TarCMSError('not open: %r' % self)

    if info is None:
        header = TarInfo()
    else:
        header = info
    assert isinstance(header, TarInfo)

    new_id = '%08x' % self._artdb.nextrecno()
    header.name = new_id + header.name
    corpus_id = self._add_corpus(header, data)
    # Sanity check: the corpus must echo the id that was just generated.
    assert new_id == corpus_id
    self._artdb.add_record(corpus_id)
    return new_id
def _addMember(path, data, modtime):
    """Add directory entries for the components of ``path``, then the member.

    Every non-empty ``/``-separated component of ``path`` (the last one
    included) is added to ``archive`` as an empty directory entry named
    after that single component; afterwards the member itself is added via
    ``_addOneMember(path, data, modtime)``.
    """
    from tarfile import DIRTYPE

    for component in filter(None, path.split('/')):
        dir_info = TarInfo()
        dir_info.name = component
        dir_info.size = 0
        # NOTE(review): this reads ``mod_time``, not the ``modtime``
        # parameter -- presumably a variable of the enclosing scope; confirm.
        dir_info.mtime = mod_time
        dir_info.type = DIRTYPE
        archive.addfile(dir_info, StringIO())

    _addOneMember(path, data, modtime)
def compute(self, conn, data=None):
    """Build the PAX tar header for this entry and cache it on the instance.

    Sets ``self._filesize`` to the payload size and ``self._buf`` to the
    serialised header.  For static large objects the fetched manifest is
    additionally stored in ``self._slo``.

    :param conn: client used to query container/object properties and to
        fetch a large-object manifest
    :param data: pre-fetched data; used as the container metadata for the
        properties entry and as the JSON payload for the manifest entry
    """
    tarinfo = TarInfo()
    tarinfo.name = self.name
    # The permission attribute of TarInfo is ``mode``.  Assigning to a
    # misspelt ``mod`` never reaches the header (and raises AttributeError
    # where TarInfo declares __slots__).
    tarinfo.mode = 0o700
    tarinfo.uid = 0
    tarinfo.gid = 0
    tarinfo.type = REGTYPE
    tarinfo.linkname = ""

    # Synthetic entries: the payload is a JSON document, nothing to look up.
    if self.name == CONTAINER_PROPERTIES:
        meta = data or conn.container_get_properties(self.acct, self.ref)
        tarinfo.size = len(json.dumps(meta['properties'], sort_keys=True))
        self._filesize = tarinfo.size
        self._buf = tarinfo.tobuf(format=PAX_FORMAT)
        return
    elif self.name == CONTAINER_MANIFEST:
        tarinfo.size = len(json.dumps(data, sort_keys=True))
        self._filesize = tarinfo.size
        self._buf = tarinfo.tobuf(format=PAX_FORMAT)
        return

    entry = conn.object_get_properties(self.acct, self.ref, self.name)
    properties = entry['properties']

    # x-static-large-object
    if properties.get(SLO, False):
        # The real size is carried in a property; the stored object is
        # only the manifest, which is kept for later use.
        tarinfo.size = int(properties.get(SLO_SIZE))
        _, slo = conn.object_fetch(self.acct, self.ref, self.name)
        self._slo = json.loads("".join(slo), object_pairs_hook=OrderedDict)
    else:
        tarinfo.size = int(entry['length'])
    self._filesize = tarinfo.size

    # XATTR
    # do we have to store basic properties like policy, ... ?
    for key, val in properties.items():
        assert isinstance(val, basestring), \
            "Invalid type for %s:%s:%s" % (self.acct, self.name, key)
        if self.slo and key in SLO_HEADERS:
            continue
        tarinfo.pax_headers[SCHILY + key] = val
    tarinfo.pax_headers['mime_type'] = entry['mime_type']
    self._buf = tarinfo.tobuf(format=PAX_FORMAT)
def archivestream(self, ticket):
    """Generate an uncompressed tar stream for the targets of ``ticket``.

    For every entry of ``ticket['target']`` a tar header is built by hand
    and the member bytes are produced by ``stream_file_signed_url`` (when a
    signed URL is available) or ``stream_regular_file``.  After each member
    a download access is logged.  The ``tarfile`` object opened on the
    in-memory buffer only supplies the end-of-archive trailer, which is
    yielded last.
    """
    trailer_stream = cStringIO.StringIO()
    with tarfile.open(mode='w|', fileobj=trailer_stream):
        for target in ticket['target']:
            filepath, arcpath, cont_name, cont_id, f_size, f_modified = target

            header = TarInfo()
            header.name = arcpath.lstrip('/')
            header.size = f_size
            header.mtime = datetime_to_epoch(f_modified)
            header_buf = header.tobuf()
            file_info = (filepath, cont_name, cont_id, arcpath)

            # A missing resource simply means there is no signed URL.
            try:
                signed_url = files.get_signed_url(filepath, config.fs)
            except fs.errors.ResourceNotFound:
                signed_url = None

            if signed_url:
                member_chunks = self.stream_file_signed_url(
                    signed_url, header_buf, file_info)
            else:
                member_chunks = self.stream_regular_file(
                    filepath, header_buf, file_info)
            for piece in member_chunks:
                yield piece

            # log download
            self.log_user_access(
                AccessType.download_file,
                cont_name=cont_name,
                cont_id=cont_id,
                filename=os.path.basename(arcpath),
                origin_override=ticket['origin'],
                download_ticket=ticket['_id'])

    # Leaving the ``with`` block wrote the tar stream trailer to the buffer.
    yield trailer_stream.getvalue()
    trailer_stream.close()
def compute(self, conn, data=None):
    """Build the PAX tar header for this entry and cache it on the instance.

    Sets ``self._filesize`` to the payload size and ``self._buf`` to the
    serialised header.  For object entries it also fills
    ``self._checksums`` with per-part checksum information (from the
    large-object manifest, or from the located chunks), and for static
    large objects it stores the manifest in ``self._slo``.

    :param conn: client used to query container/object properties, fetch a
        large-object manifest and locate object chunks
    :param data: pre-fetched data; used as the container metadata for the
        properties entry and as the JSON payload for the manifest entry
    """
    tarinfo = TarInfo()
    tarinfo.name = self.name
    # The permission attribute of TarInfo is ``mode``.  Assigning to a
    # misspelt ``mod`` never reaches the header (and raises AttributeError
    # where TarInfo declares __slots__).
    tarinfo.mode = 0o700
    tarinfo.uid = 0
    tarinfo.gid = 0
    tarinfo.type = REGTYPE
    tarinfo.linkname = ""

    # Synthetic entries: the payload is a JSON document, nothing to look up.
    if self.name == CONTAINER_PROPERTIES:
        meta = data or conn.container_get_properties(self.acct, self.ref)
        tarinfo.size = len(json.dumps(meta['properties'], sort_keys=True))
        self._filesize = tarinfo.size
        self._buf = tarinfo.tobuf(format=PAX_FORMAT)
        return
    elif self.name == CONTAINER_MANIFEST:
        tarinfo.size = len(json.dumps(data, sort_keys=True))
        self._filesize = tarinfo.size
        self._buf = tarinfo.tobuf(format=PAX_FORMAT)
        return

    entry = conn.object_get_properties(self.acct, self.ref, self.name)
    properties = entry['properties']

    # x-static-large-object
    if properties.get(SLO, False):
        tarinfo.size = int(properties.get(SLO_SIZE))
        _, slo = conn.object_fetch(self.acct, self.ref, self.name,
                                   properties=False)
        self._slo = json.loads("".join(slo), object_pairs_hook=OrderedDict)
        self._checksums = {}
        # format MD5 to share same format as multi chunks object
        offset = 0
        for idx, ck in enumerate(self._slo):
            self._checksums[idx] = {
                'hash': ck['hash'].upper(),
                'size': ck['bytes'],
                'offset': offset
            }
            offset += ck['bytes']
    else:
        tarinfo.size = int(entry['length'])
        meta, chunks = conn.object_locate(self.acct, self.ref, self.name,
                                          properties=False)
        storage_method = STORAGE_METHODS.load(meta['chunk_method'])
        chunks = _sort_chunks(chunks, storage_method.ec)
        # Keep only the first chunk of every position and drop the keys
        # that are removed below before storing the result.
        for idx in chunks:
            chunks[idx] = chunks[idx][0]
            del chunks[idx]['url']
            del chunks[idx]['score']
            del chunks[idx]['pos']
        self._checksums = chunks
    self._filesize = tarinfo.size

    # XATTR
    # do we have to store basic properties like policy, ... ?
    for key, val in properties.items():
        assert isinstance(val, basestring), \
            "Invalid type for %s:%s:%s" % (self.acct, self.name, key)
        if self.slo and key in SLO_HEADERS:
            continue
        tarinfo.pax_headers[SCHILY + key] = val
    tarinfo.pax_headers['mime_type'] = entry['mime_type']
    self._buf = tarinfo.tobuf(format=PAX_FORMAT)
def write_tar(archive_url, manifest_path, tar_path, strip_prefix=None,
              progress_bar=False, overwrite=False):
    """
    Write all objects from archive_url to tar_path.
    Write list of objects to manifest_path.

    Unless ``overwrite`` is set, an existing tar or manifest raises
    ``IOError``; so does a listing without any object.  ``strip_prefix`` is
    removed from the start of member names that begin with it.  A member
    whose stored size differs from the reported content length raises
    ``ValueError``.
    """
    if not overwrite:
        for target in (tar_path, manifest_path):
            if exists(target):
                raise IOError("%s already exists." % target)

    # get iterator of items to tar, and check that it includes at least one item
    objects = list_objects(archive_url)
    try:
        _, objects = peek(iter(objects))
    except StopIteration:
        raise IOError("No objects found at %s" % archive_url)

    # write tar
    make_parent_dir(tar_path)
    files_written = []
    with open(tar_path, 'wb', ignore_ext=True) as tar_out, \
            LoggingTarFile.open(fileobj=tar_out, mode='w|') as tar, \
            TemporaryDirectory() as temp_dir:

        # load object contents in background threads
        items = threaded_queue(load_object, ((obj, temp_dir) for obj in objects))

        # tar each item
        for obj, response, body in tqdm(items, disable=not progress_bar):
            body = HashingFile(body)

            tar_info = TarInfo()
            tar_info.size = int(response['ContentLength'])
            tar_info.mtime = response['LastModified'].timestamp()
            member_name = obj.key
            if strip_prefix and member_name.startswith(strip_prefix):
                member_name = member_name[len(strip_prefix):]
            tar_info.name = member_name
            tar.addfile(tar_info, body)

            # The entry that was just written carries the archive offsets.
            member = tar.members[-1]
            record = OrderedDict([
                # inventory fields
                # NOTE: the inventory 'Owner' column is not recorded.
                ('Bucket', obj.bucket_name),
                ('Key', obj.key),
                ('Size', response['ContentLength']),
                ('LastModifiedDate', response['LastModified'].isoformat()),
                ('ETag', response['ETag'].strip('"')),
                ('StorageClass', response.get('StorageClass', 'STANDARD')),
                ('VersionId', response.get('VersionId', '')),
                # tar fields
                ('TarMD5', body.hexdigest()),
                ('TarOffset', member.offset),
                ('TarDataOffset', member.offset_data),
                ('TarSize', member.size),
            ])
            if strip_prefix:
                record['TarStrippedPrefix'] = strip_prefix
            files_written.append(record)

            if response['ContentLength'] != member.size:
                raise ValueError("Object size mismatch: %s" % obj.key)

    # write csv
    make_parent_dir(manifest_path)
    rows = sorted(files_written, key=lambda row: row['Key'])
    write_dicts_to_csv(manifest_path, rows)
def strip_prefix(self, prefix: str, member: tarfile.TarInfo):
    """Strip ``prefix`` from a member's name and, for hard links, its target.

    Both values are rewritten in place through ``self.strip_slash``; nothing
    is returned.
    """
    member.name = self.strip_slash(prefix, member.name)

    # Only hard links get their link target rewritten as well.
    if not member.islnk() or member.issym():
        return
    member.linkname = self.strip_slash(prefix, member.linkname)
def filter1(tarinfo: tarfile.TarInfo):
    """Drop the first ``len(path)`` characters of the member name.

    ``path`` is taken from the surrounding scope.  The member is modified in
    place and returned.
    """
    prefix_length = len(path)
    tarinfo.name = tarinfo.name[prefix_length:]
    return tarinfo