def _unpack(self, filepath):
    """Determine the vim script's extension and unpack it.

    Sets the files variable. Removes the archive file and temp dir.
    """
    import os.path
    root, ext = os.path.splitext(filepath)
    # Check compound extensions (.tar.gz, .tar.bz2) first; a bare-extension
    # branch placed earlier would shadow them.
    if (ext == u'.gz' or ext == u'.bz2') and os.path.splitext(root)[1] == u'.tar':
        import tarfile
        archive = tarfile.open(filepath)
        self.files = [member.name for member in archive.getmembers()
                      if member.isfile()]
        archive.extractall('/home/chris/.vim')
    elif ext == u'.zip':
        from zipfile import ZipFile
        archive = ZipFile(filepath, 'r')
        # Skip directory entries, which end with a slash.
        self.files = [member.filename for member in archive.infolist()
                      if not member.filename[-1] == '/']
        archive.extractall('/home/chris/.vim')
    elif ext == u'.tar' or ext == u'.tgz':
        import tarfile
        archive = tarfile.open(filepath)
        self.files = [member.name for member in archive.getmembers()
                      if member.isfile()]
        archive.extractall('/home/chris/.vim')
    elif ext == u'.vba':
        # Vimball archives install themselves when sourced from vim.
        vimExecute(':so %\n:q\n')
        self.files = [filepath]
    # Cleanup
    print("Deleting {0} and {1}".format(filepath, os.path.dirname(filepath)))
    os.unlink(filepath)
    os.rmdir(os.path.dirname(filepath))
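# Illustration (not part of the original): os.path.splitext strips only the
# last extension, which is why _unpack re-splits `root` to detect ".tar.gz".
# The path below is a made-up example.
import os.path
root, ext = os.path.splitext('plugin.tar.gz')
assert (root, ext) == ('plugin.tar', '.gz')
assert os.path.splitext(root)[1] == '.tar'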
import pathlib
import tarfile
from zipfile import ZipFile


class UnsupportedArchiveError(Exception):
    """Raised for archive extensions extract_archive cannot handle."""


def extract_archive(archive_path, extract_to, exclude=None):
    """
    Extract archive (.tar, .zip)

    :param archive_path: Path to archive
    :type archive_path: String|pathlib.Path
    :param extract_to: Path to extraction
    :type extract_to: String|pathlib.Path
    :param exclude: Patterns for files and directories that should not be
        extracted (just path sub-strings with no wildcards or regexp)
    :type exclude: List
    """
    archive_path = pathlib.Path(archive_path)
    extract_to = pathlib.Path(extract_to)

    if archive_path.suffix == '.tar':
        package = tarfile.open(str(archive_path), 'r')
    elif archive_path.suffix == '.gz':
        package = tarfile.open(str(archive_path), 'r:gz')
    elif archive_path.suffix == '.zip':
        package = ZipFile(str(archive_path))
    else:
        raise UnsupportedArchiveError(
            f"Unsupported archive extension {archive_path.suffix}")

    if isinstance(package, ZipFile):
        package_data = package.infolist()
        is_zip = True
    else:
        package_data = package.getmembers()
        is_zip = False

    # None means "extract everything" for both ZipFile and TarFile.
    data_to_extract = None
    if exclude and isinstance(exclude, list):
        data_to_extract = []
        for member in package_data:
            is_excluded = False
            for pattern in exclude:
                member_path = member.filename if is_zip else member.name
                if pattern in member_path:
                    is_excluded = True
                    break
            if not is_excluded:
                data_to_extract.append(member)

    package.extractall(extract_to, members=data_to_extract)
    package.close()
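# Hypothetical usage sketch for extract_archive; the archive name, target
# directory, and exclude sub-strings below are placeholders.
extract_archive('vendor.tar.gz', 'build/vendor', exclude=['docs/', 'tests/'])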
import os
import tarfile
from urllib.request import urlretrieve
from zipfile import ZipFile


# Method of a downloader class; self.show_download_progress is its
# urlretrieve progress callback.
def download(self, input_data_url, output_data_path):
    file_name = input_data_url.split('/')[-1]
    output_file_path = os.path.join(output_data_path, file_name)
    if not os.path.exists(output_file_path):
        # Creating output directory if it does not exist
        if not os.path.exists(output_data_path):
            os.makedirs(output_data_path)
        # Downloading the data file
        file_path, _ = urlretrieve(url=input_data_url,
                                   filename=output_file_path,
                                   reporthook=self.show_download_progress)
        print('\nDownload completed')
    else:
        print('File already downloaded')
    extracted_archive_path = None
    if output_file_path.endswith('.zip'):
        # Extracting zip archive
        archive = ZipFile(file=output_file_path, mode='r')
        archive.extractall(output_data_path)
        # Assumes the first archive member is the top-level directory.
        extracted_archive_path = os.path.join(output_data_path,
                                              archive.infolist()[0].filename)
        print('Archive unpacked')
    elif output_file_path.endswith(('.tar.gz', '.tgz')):
        # Extracting tar-ball
        archive = tarfile.open(name=output_file_path, mode='r:gz')
        archive.extractall(output_data_path)
        # Assumes the first archive member is the top-level directory.
        extracted_archive_path = os.path.join(output_data_path,
                                              archive.getmembers()[0].name)
        print('Archive unpacked')
    else:
        print('Archive file format not supported')
    return extracted_archive_path
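# Hypothetical call; DataLoader stands in for whatever class defines
# download() and show_download_progress(), and the URL is a placeholder.
loader = DataLoader()
dataset_dir = loader.download('http://example.com/datasets/corpus.tar.gz',
                              './data')
print(dataset_dir)  # e.g. ./data/corpus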
import tarfile
import zipfile
from zipfile import ZipFile

import six
from django import forms
from django.utils.translation import ugettext as _  # assumed translation import


def clean_dataset_file(self):
    """Perform the cleaning of the dataset_file (the archive file
    specified by the user).

    This method returns information about the files in the archive in the
    form of 2-tuples (filesize, stream). The returned list will be
    accessible from the cleaned_data dictionary.
    """
    # First try to unzip the archive
    try:
        archive = ZipFile(self.cleaned_data["dataset_file"], allowZip64=True)

        def get_filestream(filename):
            f = archive.open(filename)
            size = archive.getinfo(filename).file_size
            streamable = six.BytesIO(f.read())
            streamable.name = f.name.replace('/', '_')
            # streamable.name = f.name.split('/')[-1]
            f.close()
            return size, streamable

        def should_include(filename):
            # Bail on directories
            if filename.endswith("/"):
                return False
            # And on hidden files
            if filename.split("/")[-1].startswith("."):
                return False
            return True

        files = [
            get_filestream(file_name)
            for file_name in archive.namelist()
            if should_include(file_name)
        ]
    except zipfile.BadZipfile:
        # Bad zip? Try tar why not
        try:
            # Reset the file so we can read it again
            self.cleaned_data["dataset_file"].seek(0)
            archive = tarfile.open(
                name=None, mode='r',
                fileobj=self.cleaned_data["dataset_file"])

            def get_filestream(archive_member):
                xfile = archive.extractfile(archive_member)
                if xfile is not None:
                    xfile.name = xfile.name[2:].replace('/', '_')
                    return xfile.size, xfile
                else:
                    return 0, None

            def should_include(name):
                name = name[2:]
                if name.endswith("/"):
                    return False
                # And on hidden files
                if name.split("/")[-1].startswith("."):
                    return False
                return True

            files = [
                get_filestream(member)
                for member in archive.getmembers()
                if should_include(member.name)
            ]
            # Drop members extractfile() could not open; each entry is a
            # (size, stream) tuple, so test the stream, not the tuple.
            files = [f for f in files if f[1] is not None]
        except tarfile.TarError:
            raise forms.ValidationError(_(
                'Not a valid archive file. We currently accept Zip and Tar files.'
            ), code='invalid')
    return files
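# Minimal sketch (not from the original) of the same zip-first, tar-fallback
# probing pattern on a plain file object, outside the Django form.
import io
import tarfile
import zipfile


def probe_archive(fileobj):
    try:
        return 'zip', zipfile.ZipFile(fileobj, allowZip64=True)
    except zipfile.BadZipfile:
        fileobj.seek(0)  # rewind before re-reading the stream as tar
        return 'tar', tarfile.open(fileobj=fileobj, mode='r')


buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w') as z:
    z.writestr('data.csv', 'a,b\n1,2\n')
buf.seek(0)
kind, archive = probe_archive(buf)
assert kind == 'zip'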
import tarfile
from os import remove
from os.path import getsize
from zipfile import ZipFile, is_zipfile

from rarfile import RarFile, is_rarfile  # third-party: pip install rarfile

# write_file_to_temp_file and scan_file_data are assumed to be defined
# elsewhere in this module.


def handle_file(file_path, module_filter, only_detect, is_temp_file=False):
    # todo modular archive handling
    # todo PE overlay extraction
    # todo PE resources extraction
    # todo Installer extraction
    if is_zipfile(file_path):
        # extract each file and handle it
        # todo consider adding archive password support
        try:
            z = ZipFile(file_path)
            for n in z.namelist():
                data = z.read(n)
                new_path = write_file_to_temp_file(data)
                for p, r in handle_file(new_path, module_filter, only_detect,
                                        is_temp_file=True):
                    if is_temp_file:
                        result_path = n
                    else:
                        result_path = file_path + "," + n
                    if p is not None:
                        result_path += "," + p
                    yield result_path, r
                remove(new_path)
        except KeyboardInterrupt:
            raise
        except:
            pass
    elif tarfile.is_tarfile(file_path):
        try:
            with tarfile.open(file_path, 'r') as z:
                for member in z.getmembers():
                    try:
                        data = z.extractfile(member).read()
                        n = member.name
                        new_path = write_file_to_temp_file(data)
                        for p, r in handle_file(new_path, module_filter,
                                                only_detect,
                                                is_temp_file=True):
                            if is_temp_file:
                                result_path = n
                            else:
                                result_path = file_path + "," + n
                            if p is not None:
                                result_path += "," + p
                            yield result_path, r
                        remove(new_path)
                    except KeyboardInterrupt:
                        raise
                    except:
                        pass
        except KeyboardInterrupt:
            raise
        except:
            pass
    elif is_rarfile(file_path):
        try:
            z = RarFile(file_path)
            for n in z.namelist():
                data = z.read(n)
                new_path = write_file_to_temp_file(data)
                for p, r in handle_file(new_path, module_filter, only_detect,
                                        is_temp_file=True):
                    if is_temp_file:
                        result_path = n
                    else:
                        result_path = file_path + "," + n
                    if p is not None:
                        result_path += "," + p
                    yield result_path, r
                remove(new_path)
        except KeyboardInterrupt:
            raise
        except:
            pass
    else:
        # assume we are dealing with a normal file
        # todo Convert file handling to use file paths
        if getsize(file_path) < 1024 * 1024 * 1024:
            with open(file_path, mode='rb') as file_handle:
                file_content = file_handle.read()
            r = scan_file_data(file_content, module_filter, only_detect)
            if r is not None:
                if is_temp_file:
                    yield None, r
                else:
                    yield file_path, r
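# Sketch of driving the recursive generator; the sample path is a
# placeholder and scan results depend on the loaded scanner modules.
for virtual_path, result in handle_file('samples/nested.zip',
                                        module_filter=None,
                                        only_detect=True):
    print(virtual_path, result)  # e.g. "samples/nested.zip,inner.exe ..."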
import tarfile
from zipfile import ZipFile

import numpy as np


def loadZip(self, filePaths):
    """
    Compatible with zip, tar and gz extensions.
    Read only Red, Near IR and Thermal IR bands.
    Get satellite type by looking at the name of the metadata file.
    """
    filepath = filePaths["zip"]
    recognised = False
    bands = {"Error": None}
    for ext in [".tar.gz", ".tar", ".zip", ".gz"]:
        if filepath.lower().endswith(ext):
            recognised = True
    if not recognised:
        bands["Error"] = "Unknown compressed file format"
        return bands
    self.folder = filepath[:filepath.rfind("/")]
    if filepath.lower().endswith(".zip"):
        compressed = ZipFile(filepath, "r")
        extract = compressed.extract
        listoffiles = compressed.namelist()
    elif filepath.lower().endswith(".gz"):
        compressed = tarfile.open(filepath, "r:gz")
        extract = compressed.extract
        listoffiles = [member.name for member in compressed.getmembers()]
    else:
        compressed = tarfile.open(filepath, "r")
        extract = compressed.extract
        # Use member names so the MTL/band checks below work on strings.
        listoffiles = [member.name for member in compressed.getmembers()]
    for filename in listoffiles:
        if filename.upper().endswith("MTL.TXT"):
            if filename[:4] == "LC08":
                bands["sat_type"] = "Landsat8"
                sat_type = "Landsat8"
            if filename[:4] == "LT05":
                bands["sat_type"] = "Landsat5"
                sat_type = "Landsat5"
    # Bail out if no metadata file matched a known satellite; this also
    # covers archives with no MTL file at all, where sat_type is unbound.
    if "sat_type" not in bands:
        bands["Error"] = ("Unknown satellite - Please verify that files "
                          "have not been renamed")
        compressed.close()
        return bands
    sat_bands = {
        "Landsat5": {"Red": "B3", "Near-IR": "B4", "Thermal-IR": "B6"},
        "Landsat8": {"Red": "B4", "Near-IR": "B5", "Thermal-IR": "B10"},
    }
    shapefile = None
    if "Shape" in filePaths:
        shapefile = filePaths["Shape"]
    filePaths = dict()
    for band in ("Red", "Near-IR", "Thermal-IR"):
        bands[band] = np.array([])
        for filename in listoffiles:
            if filename.upper().endswith(sat_bands[sat_type][band] + ".TIF"):
                extract(filename)
                filePaths[band] = filename
    compressed.close()
    for band in ("Red", "Near-IR", "Thermal-IR"):
        bands[band] = self.readBand(filePaths[band])
    if shapefile:
        bands["Shape"] = self.readShapeFile(shapefile)
        if isinstance(bands["Shape"], str):
            bands["Error"] = bands["Shape"]
            return bands
    return bands
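# Hypothetical call; SceneLoader stands in for the class that defines
# loadZip, readBand and readShapeFile, and the scene path is a placeholder.
bands = SceneLoader().loadZip({"zip": "LC08_L1TP_201024.tar.gz"})
if bands["Error"]:
    print(bands["Error"])
else:
    print(bands["sat_type"], bands["Red"].shape)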
def filter_files(self):
    fileset_name = request.params["fileset_name"]
    unique_id = request.params['unique_id']
    course = h.get_object_or_404(Course, id=request.params["course_id"])
    check_course_access(course.id)
    #is_solution = bool(int(request.params["is_solution"]))
    offering = h.get_object_or_404(Offering, id=request.params["offering_id"])
    semester = offering.semester
    student_fileset = 0
    solution_fileset = 1
    base_fileset = 2
    fileset_type = student_fileset
    if isinstance(semester, SolutionSemester):
        fileset_type = solution_fileset
    elif isinstance(semester, BaseSemester):
        fileset_type = base_fileset
    type = request.params["type"]
    if type == "svn":
        svnroot = str(request.params["svn_url"])
        subdir = str(request.params["svn_subdir"])
        username = str(request.params["svn_username"])
        password = str(request.params["svn_password"])
        rev_time = time.strptime(str(request.params['svn_rev_time']),
                                 "%Y-%m-%d %H:%M:%S")
        exported_path = h.svn_export(svnroot, username, password, subdir,
                                     rev_time, fileset_type > 0, unique_id)
        h.update_session('import_status', "Post-processing files", unique_id)
        fileset = None
        if fileset_type == student_fileset:
            fileset = FileSet()
        elif fileset_type == solution_fileset:
            fileset = SolutionFileSet()
            fileset.isSolutionSet = True
        elif fileset_type == base_fileset:
            fileset = BaseFileSet()
        fileset.name = fileset_name
        fileset.subdir = subdir
        fileset.tempDir = exported_path
        fileset.course = course
        fileset.offering = offering
        fileset.isComplete = False
        c.fileset = fileset
        files = h.lsdir(fileset.tempDir)
        # chop off temp dir from names
        files = [x[len(fileset.tempDir) + 1:] for x in files]
        files.sort()
        c.files = files
    if type == "archive":
        h.update_session('import_status', "Processing archive file", unique_id)
        subdir = request.params["archive_subdir"]
        myfile = request.POST['file']
        download_path = mkdtemp()
        path = os.path.join(download_path, myfile.filename.replace(os.sep, '_'))
        permanent_file = open(path, 'wb')
        shutil.copyfileobj(myfile.file, permanent_file)
        myfile.file.close()
        permanent_file.close()
        extracted_directory_path = mkdtemp()
        h.update_session('import_status', "Extracting archive file", unique_id)
        if zipfile.is_zipfile(path):
            file = ZipFile(path, 'r')
            h.extract_zip(file, extracted_directory_path)
        if tarfile.is_tarfile(path):
            file = TarFile.open(path, 'r')
            # file.extractall(extracted_directory_path) removed for
            # python 2.4 compat
            for member in file.getmembers():
                file.extract(member, extracted_directory_path)
        h.update_session('import_status', "Post-processing files", unique_id)
        if fileset_type == student_fileset:
            fileset = FileSet()
        elif fileset_type == solution_fileset:
            fileset = SolutionFileSet()
            fileset.isSolutionSet = True
        elif fileset_type == base_fileset:
            fileset = BaseFileSet()
        fileset.name = fileset_name
        fileset.subdir = subdir
        fileset.tempDir = extracted_directory_path
        fileset.course = course
        fileset.offering = offering
        fileset.isComplete = False
        c.fileset = fileset
        files = h.lsdir(fileset.tempDir)
        # chop off temp dir from names
        files = [x[len(fileset.tempDir) + 1:] for x in files]
        files.sort()
        c.files = files
        shutil.rmtree(download_path)
    Session.commit()
    unique_files_dict = {}
    for filepath in c.files:
        filename = filepath.split("/")[-1]
        if filename not in unique_files_dict:
            unique_files_dict[filename] = 0
        else:
            unique_files_dict[filename] += 1
    # Most frequently duplicated basenames first.
    c.unique_files = sorted(unique_files_dict.keys(),
                            key=lambda x: unique_files_dict[x],
                            reverse=True)
    c.unique_files.insert(0, "None")
    h.del_from_session('import_status', unique_id)
    session.save()
    return render("/derived/import/filter_files.html")
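# Illustration (not from the original): the duplicate-basename ranking above,
# restated with collections.Counter on made-up paths; the resulting order is
# the same, most frequent basename first.
from collections import Counter

paths = ["a1/main.py", "a2/main.py", "a1/util.py"]
counts = Counter(p.split("/")[-1] for p in paths)
ranked = sorted(counts, key=counts.get, reverse=True)
print(ranked)  # ['main.py', 'util.py']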