def extract(context, data):
    """Extract a compressed file and emit one record per contained file.

    Dispatches on the detected MIME type (zip / tar / 7zip); unsupported
    archive types are logged and skipped. Each extracted file is stored
    and emitted with its content hash and archive-relative file name.
    """
    with context.http.rehash(data) as result:
        file_path = result.file_path
        content_type = result.content_type
        # Unpack into a fresh scratch directory so relative paths are clean.
        extract_dir = random_filename(context.work_path)
        if content_type in ZIP_MIME_TYPES:
            extracted_files = extract_zip(file_path, extract_dir, context)
        elif content_type in TAR_MIME_TYPES:
            extracted_files = extract_tar(file_path, extract_dir, context)
        elif content_type in SEVENZIP_MIME_TYPES:
            extracted_files = extract_7zip(file_path, extract_dir, context)
        else:
            context.log.warning(
                "Unsupported archive content type: %s", content_type
            )
            return
        # NOTE: the previous version accumulated every hash into an
        # `extracted_content_hashes` dict that was never read — dropped.
        for path in extracted_files:
            relative_path = os.path.relpath(path, extract_dir)
            content_hash = context.store_file(path)
            data['content_hash'] = content_hash
            data['file_name'] = relative_path
            # Emit a copy so downstream mutation cannot leak between files.
            context.emit(data=data.copy())
def extract(context, data):
    """
    Extract a compressed file

    optional params in context:

    wildcards: only store extracted files matching these shell-style wildcards
    """
    with context.http.rehash(data) as result:
        archive_path = result.file_path
        mime_type = result.content_type
        target_dir = random_filename(context.work_path)
        # Choose the unpacker based on the detected MIME type.
        if mime_type in ZIP_MIME_TYPES:
            unpacked = extract_zip(archive_path, target_dir, context)
        elif mime_type in TAR_MIME_TYPES:
            unpacked = extract_tar(archive_path, target_dir, context)
        elif mime_type in SEVENZIP_MIME_TYPES:
            unpacked = extract_7zip(archive_path, target_dir, context)
        else:
            context.log.warning("Unsupported archive content type: %s", mime_type)
            return
        # An empty wildcard list means "no filter" — normalise to None.
        patterns = ensure_list(context.params.get("wildcards")) or None
        for extracted in unpacked:
            # Guard clause: skip anything that fails the wildcard filter.
            if patterns is not None and not _test_fname(patterns, extracted):
                continue
            rel_name = os.path.relpath(extracted, target_dir)
            data["content_hash"] = context.store_file(extracted)
            data["file_name"] = rel_name
            context.emit(data=data.copy())
def store_data(self, data, encoding='utf-8'):
    """Persist *data* into the archive via a scratch file.

    Text input is encoded (default UTF-8) before writing; ``None``
    produces an empty file. The scratch file is removed afterwards
    regardless of success.
    """
    scratch = random_filename(self.work_path)
    try:
        payload = data.encode(encoding) if isinstance(data, str) else data
        with open(scratch, 'wb') as handle:
            if payload is not None:
                handle.write(payload)
        return self.store_file(scratch)
    finally:
        # Best-effort cleanup — the file may already be gone.
        try:
            os.unlink(scratch)
        except OSError:
            pass
def fetch(self):
    """Lazily trigger download of the data when requested.

    Returns the local file path, resolving it from (in order): an
    already-fetched path, the archive via the known content hash, or a
    fresh HTTP download. Returns None if none of these are available.
    """
    # Tier 1: already materialised on disk during this run.
    if self._file_path is not None:
        return self._file_path
    temp_path = self.context.work_path
    # Tier 2: content is in the archive — restore it into the work dir.
    if self._content_hash is not None:
        self._file_path = storage.load_file(self._content_hash, temp_path=temp_path)
        return self._file_path
    # Tier 3: stream the HTTP response body to disk, hashing as we go.
    if self.response is not None:
        self._file_path = random_filename(temp_path)
        content_hash = sha1()
        with open(self._file_path, 'wb') as fh:
            for chunk in self.response.iter_content(chunk_size=8192):
                content_hash.update(chunk)
                fh.write(chunk)
        # Mark the temp file for cleanup once it has been archived.
        self._remove_file = True
        chash = content_hash.hexdigest()
        self._content_hash = storage.archive_file(self._file_path, content_hash=chash)
        # Cache successful responses so re-runs can skip the download.
        if self.http.cache and self.ok:
            self.context.set_tag(self.request_id, self.serialize())
        # NOTE(review): naive UTC timestamp (no tzinfo) — presumably fine
        # for tagging, but confirm consumers expect UTC.
        self.retrieved_at = datetime.utcnow().isoformat()
    return self._file_path