def extract_docs_from_tarball(self, category: str, doc_name: str, tmp_tarball_fp: str):
    """
    Extracts the tarball into a static html folder for serving in the application and
    adds the doc to the registry.

    The archive is extracted into ``docserver/static/<category>``; any directory
    previously extracted for ``doc_name`` is removed first. Every archive member is
    checked before extraction so that the tarball cannot write outside the category
    directory.

    Args:
        category (str): Category of the incoming document tarball.
        doc_name (str): Document name of the incoming tarball.
        tmp_tarball_fp (str): File path to the tarball to store and extract.

    Raises:
        ValueError: If a member of the tarball (or the target of a link member)
            would resolve outside the category directory.
        tarfile.TarError: If the file cannot be read as a gzip-compressed tarball.
    """
    logger.info("Extracting document: {doc_filename} with category: "
                "{doc_category} from tarball.".format(doc_filename=doc_name,
                                                      doc_category=category))

    doc_category_dir = os.path.join('docserver', 'static', category)
    this_doc_dir = os.path.join(doc_category_dir, doc_name)
    flask_doc_path = os.path.join('static', category, doc_name)

    # Create the category directory (and any missing parents). ``exist_ok`` avoids
    # the check-then-create race of ``os.path.exists`` + ``os.mkdir``.
    os.makedirs(doc_category_dir, exist_ok=True)

    # Remove previous document html if it's already there.
    if os.path.exists(this_doc_dir):
        rmtree(this_doc_dir, ignore_errors=True)

    # TODO: Add check to make sure extracted tarball folder has the same name as doc_name.
    # TODO: Add check to make sure index.html exists.
    dest_root = os.path.realpath(doc_category_dir)

    def _inside_dest(path):
        # True when ``path`` is ``dest_root`` itself or located beneath it.
        return os.path.commonpath([dest_root, path]) == dest_root

    with tarfile.open(tmp_tarball_fp, mode="r:gz") as tar:
        # The tarball may come from an upload, so refuse members that would escape
        # the destination via "../", absolute names, or links pointing elsewhere.
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_root, member.name))
            if not _inside_dest(member_path):
                raise ValueError(
                    "Refusing to extract {name!r}: path escapes {dest!r}.".format(
                        name=member.name, dest=doc_category_dir))
            if member.issym() or member.islnk():
                # Symlink targets are relative to the link's own directory; hard
                # link targets are relative to the archive root.
                link_base = os.path.dirname(member_path) if member.issym() else dest_root
                link_target = os.path.realpath(os.path.join(link_base, member.linkname))
                if not _inside_dest(link_target):
                    raise ValueError(
                        "Refusing to extract link {name!r}: target escapes {dest!r}.".format(
                            name=member.name, dest=doc_category_dir))
        tar.extractall(path=doc_category_dir)

    self.registry.set(name=category + "_" + doc_name, value=flask_doc_path)
def home_post():
    """Handle an uploaded document tarball and hand it to ``doc_storage``.

    The uploaded file is expected to be named ``<category>_<doc_name>.tar.gz``. It is
    copied into a temporary directory and assigned to ``doc_storage`` under the key
    ``<category>_<doc_name>`` while that temporary directory still exists.

    Returns:
        A redirect to the request URL when the request has no ``file`` part, or a
        ``(message, 201)`` tuple when the upload was processed.

    Aborts:
        ``ERROR_CODE`` when parsing, validating, copying or storing the upload raises;
        ``400`` when the ``file`` part is present but falsy.
    """
    if 'file' not in request.files:
        return redirect(request.url)

    doc_obj = request.files['file']
    if doc_obj:
        try:
            logger.info(
                'Attempting to save {filename} document tarball.'.format(
                    filename=doc_obj.filename))

            doc_id = doc_obj.filename.replace(".tar.gz", "")
            category, doc_name = doc_id.split('_', 1)

            # The filename is client-controlled and both parts end up in filesystem
            # paths (the temp file below, and presumably the storage backend's
            # category/doc directories). Reject empty parts, "."/".." and anything
            # containing a path separator so the upload cannot address other
            # locations. Raising here reuses the existing failure path below.
            for part in (category, doc_name):
                if part in ('', '.', '..') or '/' in part or '\\' in part:
                    raise ValueError(
                        'Unsafe document filename: {filename!r}'.format(
                            filename=doc_obj.filename))

            doc_obj.seek(0)
            with TemporaryDirectory() as tmp_dir:
                tmp_tarball_fp = os.path.join(tmp_dir, doc_name + ".tar.gz")
                with open(tmp_tarball_fp, 'wb') as tmp_tarball_file:
                    copyfileobj(doc_obj, tmp_tarball_file, length=16384)
                # Must happen inside the ``with``: the temp file is deleted on exit.
                doc_storage[doc_id] = tmp_tarball_fp
        except Exception as e:
            log_exception(raised_exception=e)
            abort(
                ERROR_CODE,
                'Something failed with uploading, storing, or extracting your document tarball.'
            )
        else:
            msg = "Document: {doc_name} was correctly uploaded, stored, and extracted.\n".format(
                doc_name=doc_name)
            return msg, 201

    return abort(400, 'You must upload a tarball file to use the POST endpoint.')
def store_tarball(self, category: str, doc_name: str, tmp_tarball_fp: str):
    """Stores the tarball in the folder specified by ``category`` in the S3 bucket.

    The object key is ``<s3_folder>/<category>/<doc_name>.tar.gz``.

    Args:
        category (str): Category of the incoming document tarball.
        doc_name (str): Document name of the incoming tarball.
        tmp_tarball_fp (str): File path to the tarball to store.
    """
    # Local import: S3 keys always use "/" as the separator, whereas ``os.path.join``
    # would produce backslashes on Windows. ``posixpath.join`` is what
    # ``os.path.join`` already resolves to on POSIX systems.
    import posixpath

    logger.info("Storing document: {doc_filename} with category: "
                "{doc_category} in S3 bucket.".format(doc_filename=doc_name,
                                                      doc_category=category))

    s3_tarball_loc = posixpath.join(self.s3_folder, category, doc_name + ".tar.gz")
    self.bucket.upload_file(tmp_tarball_fp, s3_tarball_loc)
def initialize_storage(self):
    """Pulls down any previously stored tarballs from S3 and initializes the static html for each.

    Every object key under ``self.s3_folder`` is interpreted as
    ``.../<category>/<filename>``. Keys ending in ``/`` (empty filename) are skipped,
    as are keys without a ``/`` since no category can be derived from them. Each
    remaining object is downloaded into a temporary directory and passed to
    ``extract_docs_from_tarball``.
    """
    logger.info("Initializing S3 storage, pulling down any docs from S3 if they exist.")

    # Collect the keys up front so the listing is finished before downloads start.
    available_docs = [obj.key for obj in
                      self.bucket.objects.filter(Prefix=os.path.join(self.s3_folder))]

    for document_key in available_docs:
        key_parts = document_key.split("/")
        if len(key_parts) < 2:
            # A key with no "/" has no category segment; unpacking it used to raise
            # ValueError and abort initialization for every remaining document.
            logger.info("Skipping S3 object without a category: {key}".format(
                key=document_key))
            continue

        doc_category, doc_filename = key_parts[-2:]

        # Only want to download non-directories.
        if doc_filename == '':
            continue

        logger.info("Downloading document: {doc_filename} with category: "
                    "{doc_category} from S3.".format(doc_filename=doc_filename,
                                                     doc_category=doc_category))
        with TemporaryDirectory() as tmp_dir:
            target_path = os.path.join(tmp_dir, doc_filename)
            self.bucket.download_file(document_key, target_path)
            # Extract while the temporary download still exists.
            self.extract_docs_from_tarball(category=doc_category,
                                           doc_name=doc_filename.replace(".tar.gz", ""),
                                           tmp_tarball_fp=target_path)
def store_tarball(self, category: str, doc_name: str, tmp_tarball_fp: str):
    """Stores an incoming tarball locally in ``tarball_dir``.

    The tarball is copied to ``<tarball_dir>/<category>/<doc_name>.tar.gz``, replacing
    whatever was previously at that path.

    Args:
        category (str): Category of the incoming document tarball.
        doc_name (str): Document name of the incoming tarball.
        tmp_tarball_fp (str): File path to the tarball to store.
    """
    logger.info("Storing document: {doc_filename} with category: "
                "{doc_category} locally.".format(doc_filename=doc_name,
                                                 doc_category=category))

    tarball_category_dir = os.path.join(self.tarball_dir, category)
    dest_tarball_fp = os.path.join(tarball_category_dir, doc_name + ".tar.gz")

    # Create the category directory (and ``tarball_dir`` itself if missing).
    # ``exist_ok`` avoids the check-then-create race of exists() + mkdir().
    os.makedirs(tarball_category_dir, exist_ok=True)

    # Remove previous tarball if it's already there. ``rmtree`` only works on real
    # directories (on a file it fails, which ``ignore_errors`` used to hide), so
    # regular files and symlinks are removed with ``os.remove`` instead.
    if os.path.lexists(dest_tarball_fp):
        if os.path.isdir(dest_tarball_fp) and not os.path.islink(dest_tarball_fp):
            rmtree(dest_tarball_fp, ignore_errors=True)
        else:
            os.remove(dest_tarball_fp)

    copyfile(tmp_tarball_fp, dest_tarball_fp)