def extractFiles(indir="/Users/Pratik/Documents/Pratik/Work/practice/py-data-analysis",
                 out="/Users/Pratik/Documents/Pratik/Work/practice/py-data-analysis/extracted"):
    """Extract every *.gz archive in ``indir`` into ``out``.

    Archives whose extracted name (archive name minus the ".gz" suffix)
    already exists in ``out`` are skipped.
    """
    os.chdir(indir)  # work relative to the input directory
    archives = glob.glob("*.gz")  # all gzip archives in the input directory
    if not os.path.exists(out):
        # BUG FIX: os.mkdirs does not exist -- the correct API is os.makedirs.
        os.makedirs(out)
    # BUG FIX: list the actual output directory instead of the hard-coded
    # relative name "extracted", so a custom ``out`` argument works too.
    files = os.listdir(out)
    for archive in archives:
        # skip archives already extracted (strip the 3-char ".gz" suffix)
        if archive[:-3] not in files:
            patoolib.extract_archive(archive, outdir=out)
def get_starling_data():
    """Download and unpack the starling brain-atlas dataset.

    Skips all work when delineation .img files are already present under
    the processed-data directory.
    """
    if len(glob("../../data/processed/starling/delineations/*.img")) > 1:
        # FIX: corrected message grammar ("download" -> "downloaded").
        print("Data already downloaded")
        return
    print("Downloading data")
    dl_output = "../../data/raw/starling/"         # raw download location
    img_output = "../../data/processed/starling/"  # processed output location
    data_url = "http://uahost.uantwerpen.be/bioimaginglab/starling.zip"
    # ensure directories
    ensure_dir(dl_output)
    ensure_dir(img_output)
    ensure_dir(img_output + "delineations/")
    zip_loc = dl_output + "starling.zip"
    # download data, then extract the archive next to it
    tqdm_download(data_url, zip_loc)
    patoolib.extract_archive(zip_loc, outdir=dl_output)
    # copy atlas volumes (.img/.hdr) into the processed directory
    for img_file in np.concatenate(
            [glob(dl_output + "ATLAS_starling/*." + ed) for ed in ["img", "hdr"]]):
        shutil.copy(img_file, img_output + os.path.basename(img_file))
    # copy the delineation files (.img/.hdr/.txt) as well
    for img_file in np.concatenate([
            glob(dl_output + "ATLAS_starling/delineations/*." + ed)
            for ed in ["img", "hdr", "txt"]
    ]):
        shutil.copy(img_file,
                    img_output + "delineations/" + os.path.basename(img_file))
def _install_anime4kcpp(self):
    """ Install Anime4KCPP

    Downloads the latest Win64 MSVC CLI build from GitHub releases and
    extracts it into LOCALAPPDATA/video2x/anime4kcpp, replacing any
    previous installation.  The downloaded 7z is queued in self.trash
    for later cleanup.
    """
    print('\nInstalling Anime4KCPP')
    import patoolib
    import requests
    # get latest release of Anime4KCPP via Github API
    # at the time of writing this portion, Anime4KCPP doesn't yet have a stable release
    # therefore releases/latest won't work
    latest_release = requests.get(
        'https://api.github.com/repos/TianZerL/Anime4KCPP/releases/latest'
    ).json()
    for a in latest_release['assets']:
        # only the Win64 MSVC CLI asset is of interest
        if re.search(r'Anime4KCPP_CLI-.*-Win64-msvc\.7z',
                     a['browser_download_url']):
            anime4kcpp_7z = download(a['browser_download_url'],
                                     tempfile.gettempdir())
            self.trash.append(anime4kcpp_7z)
            # if running in PyInstaller, add sys._MEIPASS\7z to path
            # this directory contains 7za.exe and its DLL files
            # (suppress AttributeError: _MEIPASS only exists under PyInstaller)
            with contextlib.suppress(AttributeError):
                os.environ['PATH'] += f';{sys._MEIPASS}\\7z'
            # (LOCALAPPDATA / 'video2x' / 'anime4kcpp').mkdir(parents=True, exist_ok=True)
            # pyunpack.Archive(anime4kcpp_7z).extractall(LOCALAPPDATA / 'video2x' / 'anime4kcpp')
            # wipe any previous install so the extraction starts clean
            if (LOCALAPPDATA / 'video2x' / 'anime4kcpp').exists():
                shutil.rmtree(LOCALAPPDATA / 'video2x' / 'anime4kcpp')
            patoolib.extract_archive(str(anime4kcpp_7z),
                                     outdir=str(LOCALAPPDATA / 'video2x' / 'anime4kcpp'))
def extract_archive_data(archive_data, archive_name, folder):
    """Extract and save data from compressed folder from config map.

    Args:
        archive_data (str): Base64 encoded jsonnet folder (archive).
        archive_name (str): Archive name (needed for extensions).
        folder (str): Folder that will be created and archive extracted to.

    Returns:
        None
    """
    try:
        if not os.path.exists(folder):
            os.mkdir(folder)
    except OSError as e:
        log.error(f"Error when creating folder {folder}, error: {e}")
        return
    try:
        with open(archive_name, "wb") as f:
            f.write(base64.b64decode(archive_data))
    except (binascii.Error, IOError) as e:
        log.error(f"Error when decoding {archive_name}, error: {e}")
        # BUG FIX: bail out on a decode failure instead of attempting to
        # extract a missing/partial archive; still clean up any partial file.
        remove_file("./", archive_name)
        return
    try:
        patoolib.extract_archive(archive_name, outdir=folder, verbosity=-1)
        log.info(f"File {archive_name} extracted to {folder}")
    except patoolib.util.PatoolError as e:
        log.error(f"Error when extracting {archive_name}, error: {e}")
    # the decoded archive is no longer needed once extracted
    remove_file("./", archive_name)
def cbzgenerator(namefile, origen):
    """Repack the archive ``namefile`` (cbr/rar/...) as a .cbz (zip) file.

    Extracts into a temporary directory next to the archive, zips the
    extracted files as ``<name>.cbz.new``, then removes the temporary
    directory.  Extraction failures are appended to ``cbrconverter.log``
    inside ``origen`` and abort the conversion.
    """
    logfile = origen + '/cbrconverter.log'
    parents, filename = os.path.split(namefile)
    temporal = parents + '/temporal'
    try:
        os.mkdir(temporal)
    except OSError:
        print("Creation of the directory %s failed" % temporal)
    print(namefile)
    try:
        patoolib.extract_archive(namefile, outdir=temporal)
    except Exception:  # FIX: narrowed from a bare except
        with open(logfile, "a") as f:
            f.write("Error descomprimiendo: " + namefile + '\n')
        try:
            shutil.rmtree(temporal)
        except OSError:
            print('Error while deleting directory')
        # BUG FIX: stop after a failed extraction -- the original fell
        # through, renamed the source and produced an empty .cbz.
        return
    # mark the source archive as processed
    os.rename(namefile, namefile + ".extraido")
    archivos = glob.glob(temporal + '/**/*.*', recursive=True)
    archivos.sort()
    filename2, file_extension = os.path.splitext(filename)
    cbz = parents + '/' + filename2 + '.cbz.new'
    # FIX: context manager guarantees the zip is closed even on error
    with ZipFile(cbz, 'w') as zipobje:
        for archivos2 in archivos:
            datemodified(archivos2)  # external helper: normalizes timestamps
            ruta, nombrearchivo = os.path.split(archivos2)
            zipobje.write(archivos2, basename(nombrearchivo))
    try:
        shutil.rmtree(temporal)
    except Exception:
        print('Error while deleting directory')
def _download_and_unpack(url, filepath, datadir):
    """Stream ``url`` to ``filepath``, then extract the archive into ``datadir``."""
    print('Downloading file', url, filepath)
    response = requests.get(url, stream=True)
    with open(filepath, "wb") as handle:
        for data in response.iter_content(chunk_size=1024):
            if data:  # skip keep-alive chunks
                handle.write(data)
    print('Unpacking file', filepath)
    patoolib.extract_archive(filepath, outdir=datadir)


def run():
    """Download and unpack the Natura2000 geographic and tabular datasets."""
    # Get data directory from environment
    datadir = os.environ.get('NATURA_DATA_DIRECTORY', '')
    if not datadir:
        print('Datadir not found, please specify NATURA_DATA_DIRECTORY env var.')
        return
    # Geographic files: local name derived from the URL
    url = const.NATURA2000_SITES_SOURCE
    _download_and_unpack(url, os.path.join(datadir, os.path.basename(url)), datadir)
    # Tabular data: stored under a fixed local name
    url = const.NATURA2000_TABLE_SOURCE
    _download_and_unpack(url, os.path.join(datadir, 'natura2000_tabular.zip'), datadir)
def ftpDownloader(Id, startID, endID, url="<url here>", user="******",
                  passwd="<password here>"):
    """Download a numbered range of files over FTP and unpack any archives.

    NOTE(review): relies on a module-level ``pathname`` for the local
    download directory -- confirm it is defined before calling.
    """
    ftp = FTP(url)
    ftp.login(user, passwd)
    if not os.path.exists(pathname):
        os.makedirs(pathname)
    print(ftp.nlst())
    ftp.cwd("<ftp working durectory here>")
    os.chdir(pathname)
    for array in range(startID, endID + 1):
        #Enter full path below, including start and stop IDs
        fullpath = '<insert ftp path here>' % (array, Id, array)
        filename = os.path.basename(fullpath)
        try:
            with open(filename, 'wb') as file:
                ftp.retrbinary('RETR %s' % fullpath, file.write)
            print("%s downloaded" % filename)
            # IDIOM FIX: endswith() with a tuple replaces the brittle
            # mixed [-3:]/[-4:] slice comparisons.
            if filename.endswith((".gz", ".zip", ".tar")):
                patoolib.extract_archive(filename, outdir="unpack")
        except error_perm:
            print("%s is not available" % filename)
            os.remove(filename)  # drop the empty placeholder file
    ftp.close()
def main():
    """Ask for an archive (unless given on the CLI) and extract it into a
    new destination folder prefixed with DEST_FOLDER_PREFIX."""
    root = tkinter.Tk()
    root.withdraw()  # hide the bare Tk window; only the dialogs are needed
    # archive to extract: first CLI argument, else a file-picker dialog
    if len(sys.argv) == 1:
        archivepath = askopenfilename(title="Selecteaza arhiva")
    else:
        archivepath = sys.argv[1]
    folderpath = os.path.dirname(archivepath)
    # base name without the final extension
    filename = '.'.join(os.path.basename(archivepath).split('.')[:-1])
    # destination: second CLI argument, else a directory-picker dialog
    if len(sys.argv) == 3:
        destinationPath = sys.argv[2]
    else:
        destinationPath = askdirectory(
            title="Selecteaza directorul unde se vor dezarhiva fisierele.",
            initialdir=folderpath)
    destFilesPath = os.path.join(destinationPath, DEST_FOLDER_PREFIX + filename)
    if not os.path.exists(destFilesPath):
        os.makedirs(destFilesPath)
    patoolib.extract_archive(archivepath, outdir=destFilesPath)
    exit()
def extract_archive(from_path, to_path=None, remove_finished=False):
    """Unpack ``from_path`` into ``to_path`` (defaults to the archive's
    own directory), dispatching on the archive type.

    Raises ValueError for unsupported formats; optionally deletes the
    archive once extraction succeeded.
    """
    if to_path is None:
        to_path = os.path.dirname(from_path)
    # all tar flavours share one code path -- pick the open mode first
    if _is_tar(from_path):
        mode = 'r'
    elif _is_targz(from_path):
        mode = 'r:gz'
    elif _is_tarxz(from_path) and PY3:
        mode = 'r:xz'  # .tar.xz archive only supported in Python 3.x
    else:
        mode = None
    if mode is not None:
        with tarfile.open(from_path, mode) as tar:
            tar.extractall(path=to_path)
    elif _is_gzip(from_path):
        # a bare gzipped file: write it out under the name minus ".gz"
        to_path = os.path.join(
            to_path, os.path.splitext(os.path.basename(from_path))[0])
        with gzip.GzipFile(from_path) as zip_f, open(to_path, "wb") as out_f:
            out_f.write(zip_f.read())
    elif _is_zip(from_path):
        with zipfile.ZipFile(from_path, 'r') as z:
            z.extractall(to_path)
    elif _is_rar(from_path):
        patoolib.extract_archive(from_path, outdir=to_path)
    else:
        raise ValueError("Extraction of {} not supported".format(from_path))
    if remove_finished:
        os.remove(from_path)
def unpack(input_file, output_folder):
    """
    :param input_file: file to unpack
    :param output_folder: folder to save file unpacked
    :return: True when a supported archive was recognized and unpacked,
             False otherwise (directory, unknown type, or extraction error)
    """
    print(input_file)
    if os.path.isdir(input_file):
        return False
    # sniff the real file type from the first bytes instead of the extension
    with open(input_file, "rb") as handle:
        info = fleep.get(handle.read(128))
    known = {'rar', '7z', 'dmg', 'gz', 'iso', 'tar.z', 'zip'}
    print(info.extension)
    if not info.extension:
        return False
    matched = set(info.extension) & known
    if not matched:
        return False
    print(matched)
    print('---> File recognized, unpacking <---')
    try:
        patoolib.extract_archive(input_file, outdir=output_folder)
    except PatoolError:
        # probably a false positive
        return False
    return True
def decompress_file(file, dir, directories='strip'):
    """Extract ``file`` into ``dir`` (Python 2 code: .iteritems()/.next()).

    A shell command from the DECOMPRESSORS table is looked up by regex
    match (currently unused -- the shell path is commented out) and the
    extraction is delegated to patool.  With directories='strip', a lone
    top-level directory inside the archive is flattened into ``dir``.
    """
    fullcmd = None
    # pick the first configured decompressor whose pattern matches the file
    for ptr, cmd in DECOMPRESSORS.iteritems():
        if re.search(ptr, file):
            fullcmd = cmd % locals()
            break
    # if fullcmd is not None:
    #     lgr.debug("Extracting file: %s" % fullcmd)
    #     status, output = getstatusoutput(fullcmd)  # getstatusoutput is deprecated. Use cmd.Runner.run() instead.
    #     if status:
    #         lgr.debug("Failed to extract: status %d output %s" % (status, output))
    # else:
    # lgr.debug("Have no clue how to extract %s -- using patool" % file)
    verbosity = -1  # silent by default
    ef_level = lgr.getEffectiveLevel()
    # be chatty only when the logger is at DEBUG level
    if ef_level and lgr.getEffectiveLevel() <= logging.DEBUG:
        verbosity = 1
    # elif lgr.getEffectiveLevel() <= logging.INFO:
    #     verbosity = 0
    patoolib.extract_archive(file, outdir=dir, verbosity=verbosity)
    if directories == 'strip':
        _, dirs, files = os.walk(dir).next()
        if not len(files) and len(dirs) == 1:
            # move all the content under dirs[0] up 1 level
            subdir, subdirs_, files_ = os.walk(join(dir, dirs[0])).next()
            for f in subdirs_ + files_:
                os.rename(join(subdir, f), join(dir, f))
    else:
        raise NotImplementedError("Not supported %s" % directories)
def extra_file(file_path):
    """Extract ``file_path`` into its containing directory and return it."""
    # IDIOM FIX: os.path.dirname replaces the manual rfind/slice, which
    # silently chopped the last character when the path had no separator.
    file_extra_dir = os.path.dirname(file_path)
    print("file_extra_dir: ", file_extra_dir)
    # exist_ok avoids the racy exists()-then-mkdir check
    os.makedirs(file_extra_dir, exist_ok=True)
    patoolib.extract_archive(file_path, outdir=file_extra_dir)
    return file_extra_dir
def writing_to_BD():
    """Download the programtv.ru XMLTV feed and load it into the database.

    Fetches the gzipped XML next to this script, unpacks it, then inserts
    channels, telecasts, genres and programme rows that are not already
    present.  Commits once at the end.
    """
    url = 'http://programtv.ru/xmltv.xml.gz'
    response = requests.get(url)
    # save the gzipped feed next to this script (Windows path separator)
    xml_gz = open(
        os.path.dirname(os.path.realpath(__file__)) + '\\xmltv.xml.gz', 'wb')
    xml_gz.write(response.content)
    xml_gz.close()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    patoolib.extract_archive(
        os.path.dirname(os.path.realpath(__file__)) + '\\xmltv.xml.gz',
        outdir=os.path.dirname(os.path.realpath(__file__)))
    with open("xmltv.xml", encoding='utf-8') as fobj:
        xml = fobj.read().encode('utf-8')
    root = etree.fromstring(xml)
    for elem in root:
        if elem.tag == "channel":
            # insert the channel only when its id is not stored yet
            num_of_channels = session.query(BD.Channel.name).filter(
                BD.Channel.id_channel == elem.get('id')).count()
            if num_of_channels == 0:
                table_channel = BD.Channel(id_channel=elem.get('id'),
                                           name=elem[1].text)
                session.add(table_channel)
        if elem.tag == "programme":
            # telecast rows are unique by name
            num_of_id_telecasts = session.query(BD.Telecast.id).filter(
                BD.Telecast.name == elem[0].text).count()
            if num_of_id_telecasts == 0:
                table_telecast = BD.Telecast(name=elem[0].text)
                session.add(table_telecast)
            # BUG FIX: the original compared a Query object with 0 (always
            # False), so genres were never inserted; .count() was missing.
            num_of_id_genres = session.query(
                BD.Genre.id).filter(BD.Genre.name == elem[1].text).count()
            if num_of_id_genres == 0:
                table_genre = BD.Genre(name=elem[1].text)
                session.add(table_genre)
            # normalize XMLTV timestamps to "YYYY-mm-dd HH:MM:SS"
            start = parser.parse(elem.get('start'))
            start = start.strftime("%Y-%m-%d %H:%M:%S")
            end = parser.parse(elem.get('stop'))
            end = end.strftime("%Y-%m-%d %H:%M:%S")
            id_telecast = session.query(BD.Telecast.id).filter(
                BD.Telecast.name == elem[0].text).first()
            table_tvprogram = BD.TVprogram(channel=elem.get('channel'),
                                           telecast=id_telecast,
                                           start_time=start,
                                           end_time=end)
            session.add(table_tvprogram)
    session.commit()
def extract_rar_data(source, target, extract_src=False):
    """Extract .rar archives into ``target``.

    When ``extract_src`` is true, first extracts every .rar directly in
    ``source``; then extracts any .rar found anywhere under ``target``
    (e.g. archives that were nested inside other archives).
    """
    if extract_src:
        for name in os.listdir(source):
            if name.endswith(".rar"):
                # FIX: os.path.join instead of raw string concatenation,
                # which broke when ``source`` lacked a trailing separator.
                patoolib.extract_archive(os.path.join(source, name),
                                         outdir=target)
    # second pass: pick up rars that appeared during the first extraction
    allrar = [y for x in os.walk(target)
              for y in glob(os.path.join(x[0], '*.rar'))]
    for archive in allrar:
        patoolib.extract_archive(archive, outdir=target)
def uncompressFile(self, from_location, to_location):
    """Extract a .zip or .rar archive at ``from_location`` into ``to_location``.

    Other extensions are silently ignored.
    """
    if from_location.endswith(".zip"):
        # FIX: context manager closes the zip handle (the original leaked it)
        with zipfile.ZipFile(from_location) as zfile:
            zfile.extractall(to_location)
        print("uncompressing:" + from_location)
    elif from_location.endswith(".rar"):
        # rar needs an external tool; delegate to patool
        patoolib.extract_archive(from_location, outdir=to_location)
def extract_to_dir(archive_path, to_dir):
    """Unpack every file contained in the archive into ``to_dir``.

    archive_path (string): path of the archive to extract.
    to_dir (string): destination directory.
    """
    # interactive=False keeps patool from ever prompting (e.g. on overwrite)
    patoolib.extract_archive(archive_path, outdir=to_dir, interactive=False)
def _extract(self):
    """Extract the archive at self.a_path into self.e_path.

    Creates the destination directory when missing; failures are reported
    on stdout rather than raised.
    """
    try:
        if not os.path.isdir(self.e_path):
            os.mkdir(self.e_path)
        patoolib.extract_archive(self.a_path, outdir=self.e_path)
    except Exception as err:
        print(f"Failed: {str(err)}")
    else:
        print("FILES EXTRACTED")
def uncompressFile(from_location, to_location):
    """Extract a .zip or .rar archive at ``from_location`` into ``to_location``.

    Other extensions are silently ignored.
    """
    if from_location.endswith(".zip"):
        # FIX: context manager closes the zip handle (the original leaked it)
        with zipfile.ZipFile(from_location) as zfile:
            zfile.extractall(to_location)
        print("uncompressing:" + from_location)
    elif from_location.endswith(".rar"):
        # rar needs an external tool; delegate to patool
        patoolib.extract_archive(from_location, outdir=to_location)
def extract_to_dir(archive_path, to_dir):
    """Extract the content of an archive in to_dir.

    archive_path (string): path of the archive to extract.
    to_dir (string): destination directory.
    """
    # CONSISTENCY FIX: pass interactive=False like the sibling
    # extract_to_dir implementation, so patool can never block on a prompt.
    patoolib.extract_archive(archive_path, outdir=to_dir, interactive=False)
def unpack_photo_archive(self, rar_url):
    """Extract the archive at ``rar_url`` into <storage>/archive and return
    that directory.

    The directory is created on first use and emptied on every later call.
    """
    target = Path(self.storage, r'archive')
    try:
        os.mkdir(target)
    except FileExistsError:
        # already present from a previous run -- just wipe its contents
        self.clean_directory(target)
    patoolib.extract_archive(rar_url, outdir=target)
    return target
def convert_rar_to_txt(download_path):
    """Extract ``download_path`` into a same-named directory and convert its
    contents (tif/jpg/png/doc/docx/pdf) to text files."""
    # extract into a directory named after the archive (extension stripped)
    output_dir = download_path[:download_path.rfind('.')]
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # only extract when the target directory is still empty
    if len(os.listdir(output_dir)) < 1:
        patoolib.extract_archive(download_path, outdir=output_dir)
    # check whether the archive introduced one extra level of nesting
    if len(os.listdir(output_dir)) == 1:
        for extra in os.listdir(output_dir):
            if extra.endswith('.DS_Store'):
                continue
            else:
                # surplus nested directory: move its files up, then delete it
                tobe_del_dir = os.path.join(output_dir, extra)
                if os.path.isdir(tobe_del_dir):
                    for pdf_file in os.listdir(tobe_del_dir):
                        if not pdf_file.endswith('.DS_Store'):
                            # move up into the parent directory
                            shutil.move(os.path.join(tobe_del_dir, pdf_file),
                                        os.path.join(output_dir, pdf_file))
                    # remove the now-empty nested directory
                    os.removedirs(tobe_del_dir)
    # dispatch each extracted file to the right converter
    logging.info(output_dir)
    for pdf_file in os.listdir(output_dir):
        if os.path.isfile(os.path.join(output_dir, pdf_file)):
            if pdf_file.endswith('.DS_Store'):
                continue
            elif pdf_file.endswith('.tif'):
                # merge + convert images; the whole directory is handled at
                # once, so stop after the first hit
                merge_dir_pic_to_txt(output_dir)
                break
            elif pdf_file.endswith('.jpg') or pdf_file.endswith('.png'):
                # merge + convert images, then stop (same as above)
                merge_dir_pic_to_txt(output_dir)
                break
            elif pdf_file.endswith('.doc') or pdf_file.endswith('.docx'):
                if need_to_save(pdf_file):
                    doc_path = os.path.join(output_dir, pdf_file)
                    convert_doc_to_txt(doc_path)
            elif pdf_file.endswith('.pdf'):
                if need_to_save(pdf_file):
                    pdf_path = os.path.join(output_dir, pdf_file)
                    convert_pdf_to_txt(pdf_path)
            elif pdf_file.endswith('.txt'):
                continue
            else:
                # unknown file type -- log it (message intentionally kept
                # in the original Chinese)
                logging.info('解压出来的文件不知道怎么处理 %s/%s' % (output_dir, pdf_file))
    logging.info('')
    return
def _analyze_compressed_file(parent, node, path, nesting_level):
    """Fill ``node`` with metadata and hashes for ``path`` and recurse into it.

    Computes mime type, size and md5/sha1/ssdeep hashes, links the node to
    its parent, then brute-force tries to unpack the file and analyzes
    every extracted child the same way.
    """
    m_type = mime.from_file(path)
    size = os.path.getsize(path)
    # hash incrementally so large files are never fully loaded into memory
    m = md5()
    s = sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            m.update(chunk)
            s.update(chunk)
    str_md5 = m.hexdigest()
    str_sha1 = s.hexdigest()
    # PERF FIX: the original computed the ssdeep hash twice and assigned
    # node['fuzzy'] twice; once is enough.
    str_fuzzy = ssdeep.hash_from_file(path)
    node['filename'] = os.path.basename(path)
    node['mime_type'] = m_type
    node['size'] = size
    node['md5'] = str_md5
    node['sha1'] = str_sha1
    node['fuzzy'] = str_fuzzy
    node['nesting_level'] = nesting_level + 1
    node['parent_hash'] = None if parent is None else parent.get('sha1')
    node['compressed_children'] = []
    # If this is a compressed file, analyze it recursively. This means we need
    # to create a new directory, uncompress files there and calculate hashes.
    # Then, delete the extracted files when done.
    # zip, x-tar, x-7z-compressed, x-rar, vnd.ms-cab-compressed, gzip, x-bzip2
    tmpdir = tempfile.mkdtemp()
    try:
        # Brute force approach: we don't even check the mime type. We try to
        # unpack every archive.
        patoolib.extract_archive(path, outdir=tmpdir)
        # Analyze each extracted regular file
        files = [
            os.path.join(tmpdir, f) for f in os.listdir(tmpdir)
            if os.path.isfile(os.path.join(tmpdir, f))
        ]
        for f in files:
            child = dict()
            _analyze_compressed_file(parent=node, node=child, path=f,
                                     nesting_level=nesting_level + 1)
            node['compressed_children'].append(child)
    except Exception:
        # not an archive (or extraction failed) -- leaf node, nothing to add
        pass
    finally:
        # Remove the temporary file directory
        shutil.rmtree(tmpdir)
def extractFiles(indir="weather-data", out="extracted"):
    """Extract every *.gz archive in ``indir`` into ``out``, skipping
    archives whose extracted name is already present."""
    os.chdir(indir)
    archives = glob.glob("*.gz")
    if not os.path.exists(out):
        os.makedirs(out)
    # FIX: consult the actual output directory rather than the hard-coded
    # name "extracted", so non-default ``out`` values work.
    files = os.listdir(out)
    for archive in archives:
        if archive[:-3] not in files:  # strip ".gz" to compare names
            patoolib.extract_archive(archive, outdir=out)
def download_extract_all(urls, dir_save, extract_data=True, verbose=True):
    """Download urls + extract files to disk.

    Download + extract all url files to disk. If clean_cache is True,
    it removes the download files.

    Parameters
    ----------
    urls : list/tuple
        List/tuple of URL paths.
    dir_save : str
        Directory to store the downloaded data.
    extract_data : bool, optional
        Extracts/unpacks the data files (if true).
    verbose : bool, optional
        Display messages on screen if set to True.

    Raises
    ------
    Exception
        If it is an invalid url type.
    """
    # a single url string is promoted to a one-element list
    url_list = [urls] if isinstance(urls, str) else urls
    if not os.path.exists(dir_save):
        os.makedirs(dir_save)
    total = len(url_list)
    # download + extract data and remove temporary files
    for idx, entry in enumerate(url_list):
        if verbose:
            print('\nDownload url ({}/{}): {}'.format(idx + 1, total, entry))
        url, md5hash, fname, extract_dir, method = parse_url(entry)
        save_dir = os.path.join(dir_save, extract_dir)
        target = os.path.join(save_dir, fname)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        if os.path.exists(target):
            print('File already exists, skip downloading this url.')
        else:
            ok, err = download_url(url, target, method, verbose)
            if not ok:
                raise Exception(err)
        # verify the download against the expected checksum, if given
        if md5hash:
            md5_checksum(target, md5hash)
        if extract_data:
            patoolib.extract_archive(target, outdir=save_dir,
                                     verbosity=verbose)
def decompress_to_temp(self, file):
    """Extract ``file`` into self.TEMPDIR.

    Returns True on success; on failure prints + logs the error and
    returns False.
    """
    try:
        patoolib.extract_archive(archive=os.path.join(file),
                                 verbosity=0,
                                 outdir=self.TEMPDIR)
    except Exception as exc:
        message = 'Decompression failed. Exception Thrown: '
        print(message + str(exc))
        self.logfile.log_error(message + file + str(exc))
        return False
    return True
def unpack(self):
    """Extract archive's content to a temporary directory.

    return (string): the path of the temporary directory.
    """
    # fresh scratch directory under the configured temp root
    scratch = tempfile.mkdtemp(dir=config.temp_dir)
    self.temp_dir = scratch
    patoolib.extract_archive(self.path, outdir=scratch)
    return scratch
def extractFiles(indir='/home/oleh/PYDATA', out='/home/oleh/PYDATA/Extracted'):
    """Extract every *.gz archive in ``indir`` into ``out``, skipping files
    that were already extracted."""
    os.chdir(indir)
    archives = glob.glob("*.gz")
    if not os.path.exists(out):
        os.makedirs(out)
    # FIX: use the ``out`` parameter instead of the hard-coded relative
    # name "Extracted", so custom output paths work.
    files = os.listdir(out)
    for archive in archives:
        if archive[:-3] not in files:  # name without the ".gz" suffix
            patoolib.extract_archive(archive, outdir=out)
def rar_cvrt():
    """Extract every .rar in <PARENT_FOLDER>/testrar into a directory named
    after the archive (extension stripped)."""
    test_folder = os.path.join(PARENT_FOLDER, 'testrar')
    for entry in os.listdir(test_folder):
        if not entry.endswith('.rar'):
            continue
        archive_path = os.path.join(test_folder, entry)
        # destination directory = archive path without its extension
        target = archive_path[:archive_path.rfind('.')]
        if not os.path.exists(target):
            os.mkdir(target)
        patoolib.extract_archive(archive_path, outdir=target)
def ExtractRarFile(self, RarFileName, destPath):
    """Rar file extraction procedure

    Args:
        self: The reserved object 'self'
        RarFileName: Extractable file path + name
        destPath: Destination path for extracted files
    """
    # delegate the unpacking to patool, which picks the right external tool
    patoolib.extract_archive(RarFileName, outdir=destPath)
def extract(self, src, dest):
    """Unpack archive ``src`` into directory ``dest``.

    The destination directory is created first because patool complains
    when it is missing; an already-existing directory is fine.
    """
    # make destination directory first, or else patool complains
    try:
        os.mkdir(dest)
    except OSError:
        pass  # already exists -- that's fine
    extract_archive(src, outdir=dest)
def extractFiles(indir="/Users/Pratik/Documents/Pratik/Work/practice/py-data-analysis",
                 out="/Users/Pratik/Documents/Pratik/Work/practice/py-data-analysis/extracted"):
    """Extract every *.gz archive in ``indir`` into ``out``, skipping
    already-extracted files."""
    os.chdir(indir)
    archives = glob.glob("*.gz")
    if not os.path.exists(out):
        # BUG FIX: os.mkdirs does not exist; os.makedirs is the real API.
        os.makedirs(out)
    # FIX: list the configured output directory, not the hard-coded
    # relative name "extracted".
    files = os.listdir(out)
    for archive in archives:
        if archive[:-3] not in files:  # compare without the ".gz" suffix
            patoolib.extract_archive(archive, outdir=out)
def scan_archives(self):
    """Extract every recognized archive under the unsorted path, then
    delete the archive file itself."""
    for file_path in utils.get_files(self.unsorted_path):
        if not utils.is_compressed(file_path, self.extensions):
            continue
        # unpack silently into the same (unsorted) directory
        patoolib.extract_archive(
            file_path, outdir=self.unsorted_path, verbosity=-1
        )
        self.process_response(['info', f'Extracting {file_path}'])
        # the archive is no longer needed once extracted
        response = utils.delete_file(file_path)
        self.process_response(response)
def download():
    """ Download Tourism dataset. """
    # skip when the dataset directory is already in place
    if os.path.isdir(DATASET_PATH):
        logging.info(f'skip: {DATASET_PATH} directory already exists.')
        return
    # NOTE(review): this calls download() with two arguments, but this very
    # function takes none -- unless a different ``download`` helper shadows
    # it at the call site, this recursion raises TypeError.  Presumably a
    # url-fetch helper of the same name was intended; confirm and rename.
    download(DATASET_URL, DATASET_FILE_PATH)
    patoolib.extract_archive(DATASET_FILE_PATH, outdir=DATASET_PATH)
def _load_data(self, rdir, infos, filestring): self.X = np.zeros((0, 250000)) ## w for idx, info in enumerate(infos): # directory to put the raw rar file rawdir = os.path.join(rdir, 'raw') self._mkdir(rawdir) # path to find the file fpath = os.path.join(rawdir, info[0] + '.rar') # if file already exists, avoid duplicate downloads if not os.path.exists(fpath): print("no dir/file") self._download(fpath, info[3].rstrip('\n')) # compressed file to uncompress cmpfile = rawdir + '/' + info[0] + '.rar' print("file to exrtract is is::") print(cmpfile) # unpack file if not os.path.exists(rdir + '/' + info[0]): pa.extract_archive(cmpfile, outdir=rdir, program=rarpath) else: print("file already extracted, skipping unrar") # a list of all files in the extracted dir ddir = rdir + '/' + info[0] flist_all = os.listdir(ddir) # print("filelist:") # print(flist_all) # use the searchstring, build from the program arguments to find files of interest flistsorted = [i for i in flist_all if filestring in i] print("sorted filelist:") print(flistsorted) # now build the dataset from all files of interest # iterate through the filelist for f in flistsorted: # load matlab file mat_dict = loadmat(ddir + '/' + f) #,struct_as_record=False) # get the values key, tha name of thenactual dataset equal to filename #key = list(filter(lambda x: 'N15_M07_F04_' in x, mat_dict.keys())) key = list(filter(lambda x: filestring in x, mat_dict.keys())) # load data #time_series = mat_dict[key[0]][:, 0] #['Y'] time_series = mat_dict[key[0]]['Y'][0, 0][0, 6][2][:][0] self.X = np.vstack((self.X, time_series[0:250000]))
def extract() -> None:
    """Extract ttsdk.7z into a clean <cd>/ttsdk directory."""
    target = os.path.join(cd, "ttsdk")
    # start from an empty directory: recreate it when it already exists
    try:
        os.mkdir(target)
    except FileExistsError:
        shutil.rmtree(target)
        os.mkdir(target)
    patoolib.extract_archive(os.path.join(cd, "ttsdk.7z"), outdir=target)
def scan_archives(self):
    """Find archives in the unsorted directory, extract them in place and
    remove the original archive files."""
    candidates = utils.get_files(self.unsorted_path)
    for file_path in candidates:
        if utils.is_compressed(file_path, self.extensions):
            # quiet extraction straight into the unsorted directory
            patoolib.extract_archive(file_path,
                                     outdir=self.unsorted_path,
                                     verbosity=-1)
            self.process_response(['info', f'Extracting {file_path}'])
            self.process_response(utils.delete_file(file_path))
def download_delta(self):
    """Fetch and apply FIAS delta updates.

    Sequentially:
    1) checks for new deltas,
    2) downloads the delta archives,
    3) unpacks each archive into per-table XML files,
    4) calls the loader function for every file.
    """
    # I. Request information about the latest deltas.
    # Details: https://fias.nalog.ru/WebServices/Public/DownloadService.asmx?op=GetLastDownloadFileInfo
    envelope = """<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <GetAllDownloadFileInfo xmlns="https://fias.nalog.ru/WebServices/Public/DownloadService.asmx" />
        </soap:Body>
    </soap:Envelope>""".encode('utf-8')
    headers = {'Host': 'fias.nalog.ru',
               'Content-Type': 'text/xml; charset=utf-8',
               'Content-Length': str(len(envelope))}
    response = requests.post(url='https://fias.nalog.ru/WebServices/Public/DownloadService.asmx',
                             headers=headers,
                             data=envelope)
    tree = ElementTree.fromstring(response.content)
    ns = {'ns': 'https://fias.nalog.ru/WebServices/Public/DownloadService.asmx'}
    # II. The last delta version that has already been loaded.
    current_version = 574  # TODO: read current_version from the database
    # III. Iterate over every entry in the delta list.
    for el in tree.findall('.//ns:DownloadFileInfo', ns):
        version = int(el.find('.//ns:VersionId', ns).text)
        delta = el.find('.//ns:FiasDeltaXmlUrl', ns).text
        file_name = 'fias_delta_xml.rar'
        # Only process deltas newer than the loaded version:
        if version > int(current_version):
            current_version = str(version)  # TODO: persist current_version to the database
            # 1) Download and save the archive with the XML files.
            response = requests.get(delta)
            rar = os.path.join(settings.FIAS_DIR, current_version, file_name)
            os.makedirs(os.path.dirname(rar), exist_ok=True)
            with open(rar, 'wb') as f:
                f.write(response.content)
            # 2) Unpack the archive.
            patoolib.extract_archive(rar,
                                     outdir=os.path.join(settings.FIAS_DIR, current_version),
                                     interactive=False)
            # 3) Call the loader for each of the per-table files.
            for file in self.models.keys():
                self.load_to_db(file=file, delta=current_version)
def unzip(self, base_dir):
    """Walk ``base_dir`` and extract every archive found (Python 2 code).

    Files matching self.extensions go through pyunpack's Archive; bare
    'rar' files go through patool's extract_archive.  Everything is
    extracted next to where it was found.
    """
    for root, dirs, files in os.walk(base_dir):
        if files:
            for i in files:
                if i.endswith(self.extensions):
                    #print os.path.join(root,i)
                    Archive(os.path.join(root, i)).extractall(os.path.join(root))
                elif i.endswith('rar'):
                    print i
                    extract_archive(os.path.join(root, i), outdir=os.path.join(root))
def unrar_files(root, files):
    """Extract each .rar in ``files`` into ``root``.

    Stops after the first multi-part archive (".part" in the name) --
    presumably because the rar tool pulls in the remaining parts of the
    set by itself; confirm the intent.
    """
    for name in files:
        stem, ext = os.path.splitext(name)
        if ext != ".rar":
            continue
        print("rar file: ", name)
        archive = os.path.join(root, name)
        print("to extract: ", archive)
        # non-interactive so a prompt can never block a batch run
        patoolib.extract_archive(archive, outdir=root,
                                 program=rar_program, interactive=False)
        if ".part" in name:
            break
def extractFiles(indir="C:\\dest", out="C:\\dest\\Extracted"):
    """Extract every *.gz archive in ``indir`` into ``out``, skipping
    archives that were already extracted."""
    os.chdir(indir)
    archives = glob.glob("*.gz")
    print(archives)
    if not os.path.exists(out):
        os.makedirs(out)
    # FIX: list the configured output directory instead of the hard-coded
    # relative name "Extracted".
    files = os.listdir(out)
    print(files)
    for archive in archives:
        if archive[:-3] not in files:  # strip the ".gz" suffix
            patoolib.extract_archive(archive, outdir=out)
def unpackRAR(date):
    """Unpack <date>.rar, move the daily CSV into ./newdata, then clean up."""
    archive = date + '.rar'
    patoolib.extract_archive(archive, outdir=".")
    # move the extracted daily index file into the 'newdata' directory
    src = "./ProcessFile/Stk_Day/Stk_Day_Idx_Daily/%s.csv" % date
    shutil.move(src, "./newdata/")
    # drop the rar archive and the leftover ProcessFile tree
    shutil.rmtree("./ProcessFile")
    os.remove(archive)
def unpackTo(self, outdir):
    """Extract self.filename into ``outdir``.

    A non-empty existing ``outdir`` is taken as proof of a previous
    successful extraction and left untouched.  On any extraction error
    the half-written directory is removed and the error re-raised.
    """
    # if directory exists and has files we assume that we already
    # successfully extracted the archive
    if os.path.isdir(outdir):
        for _dirpath, _dirnames, files in os.walk(outdir):
            if files:
                return
    # make sure the outdir exists, but is empty
    createDirectory(outdir, deleteBefore=True)
    try:
        patoolib.extract_archive(self.filename, outdir=outdir)
    except:  # deliberately broad: always clean up, then re-raise
        shutil.rmtree(outdir)
        raise
def unzip():
    """Flask view: extract an uploaded archive, import its ENBCFG.XML into
    the DB, and re-render the upload page with the current record list."""
    db = conn()
    # archive name comes from the ?o_id= query parameter
    filename = request.args.get('o_id')
    parts_dir = os.path.join(current_app.config['UPLOAD'], "%s" % filename.split('.')[0])
    destination_path = os.path.join(parts_dir, "%s" % (filename))
    patoolib.extract_archive(destination_path, outdir=parts_dir)
    # look for the gzipped config anywhere inside the extracted tree
    for root, dirs, files in os.walk(os.path.join(parts_dir, "%s" % (filename.split('.')[0]))):
        for name in files:
            if name == 'ENBCFG.XML.gz':
                patoolib.extract_archive(os.path.join(root, name), outdir=root)
                # print(os.path.join(root, 'ENBCFG.XML'))
                # import the now-unpacked XML under the archive's base name
                xmlImport.xml_import(db, os.path.join(root, 'ENBCFG.XML'), filename.split('.')[0])
                # mark the record as imported but not yet analyzed
                # (status strings are intentionally in Chinese)
                db['ManagerInfo'].update({'_id': filename}, {'$set': {'operator': '未分析', 'status': '已导入'}})
    data = db['ManagerInfo'].find().sort('_id')
    return render_template('upload.html', data=data)
def convert(min_length, extract=True, delete_tmp=False, zip_after=False):
    """Convert HTML files, optionally extracting them from .rar archives
    first, zipping the results and cleaning up afterwards.

    min_length: minimum document length passed to the converter.
    extract: when False, convert files already present in main_dir;
             when True, unpack each year's .rar into tmp_dir and convert.
    delete_tmp: remove the working directories when finished.
    zip_after: tar.gz the results directory when done.
    """
    none_type_counter = 0
    files_counter = 0
    os.chdir(main_dir)
    if not extract:
        # files are already unpacked -- convert them in place
        none_type_counter, files_counter = convert_all_html_files(min_length)
    else:
        files_years = os.listdir(os.curdir)
        for file_rar in files_years:
            if '.rar' in file_rar and os.path.isfile(file_rar):
                # start every archive with a fresh, empty tmp_dir
                if os.path.isdir(tmp_dir):
                    shutil.rmtree(tmp_dir)
                    os.makedirs(tmp_dir)
                else:
                    os.makedirs(tmp_dir)
                patoolib.extract_archive(file_rar, outdir=tmp_dir)
                new_ntc, new_fc = convert_all_html_files(min_length)
                none_type_counter += new_ntc
                files_counter += new_fc
    # progress summary (log messages intentionally kept in Polish)
    logging.info('Zakonczono konwertowanie')
    logging.info('Liczba przekonwertowanych plikow {0}'.format(files_counter))
    logging.info('Liczba plikow konczacych sie na NoneType, zamiast <META> to: {0}'.format(none_type_counter))
    os.chdir('../')
    if zip_after:
        try:
            # pack the results directory into a gzipped tarball
            with tarfile.open(file_zip_after, "w:gz") as tar:
                tar.add(results_dir, arcname=os.path.basename(results_dir))
            logging.info('Spakowano efekty do {0}'.format(file_zip_after))
        except:
            logging.error('Wystapil blad przy pakowaniu wynikow pracy')
            # packing failed -- keep results_dir even if delete_tmp is set
            zip_after = False
    if delete_tmp:
        try:
            shutil.rmtree(main_dir)
            if zip_after:
                shutil.rmtree(results_dir)
            logging.info('Skasowane pliki tymczasowe')
        except:
            logging.error('Wystapil blad przy kasowaniu tymczasowych plikow')
def unpack(path):
    """Recursively unpack archives under ``path`` (Python 2 code).

    Handles one archive per call: after extracting it the function returns
    False so the caller can re-invoke until a full walk finds no more
    archives, at which point it returns True.
    """
    for subdir, dirs, files in os.walk(path):
        print 'subdir[' + subdir + ']'
        print 'dirs[' + ",".join(dirs) + ']'
        print 'files[' + ",".join(files) + ']'
        for file in files:
            f = os.path.join(subdir, file)
            print 'file[' + f + ']'
            if os.path.isfile(f):
                print 'file[' + f + ']'
                if file.endswith('gz'):
                    # gunzip in place, then drop the .gz
                    fo = f[:-3]
                    with gzip.open(f, 'rb') as f_in, open(fo, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    # redundant: the with-block already closed both handles
                    f_in.close()
                    f_out.close()
                    os.remove(f)
                    print 'gunzip ' + fo
                    return False
                elif file.endswith('tar'):
                    tfile = tarfile.open(f, 'r')
                    tfile.extractall(subdir)
                    tfile.close()
                    os.remove(f)
                    print 'untar ' + f
                    return False
                elif file.endswith('zip'):
                    zip_ref = zipfile.ZipFile(f)  # create zipfile object
                    zip_ref.extractall(subdir)    # extract file to dir
                    zip_ref.close()               # close file
                    os.remove(f)                  # delete zipped file
                    print 'unzip ' + f
                    return False
                elif file.endswith('rar'):
                    #rar_ref = rarfile.RarFile(f)
                    #rar_ref.extractall(subdir)
                    #rar_ref.close()
                    patoolib.extract_archive(f, outdir=subdir)
                    os.remove(f)
                    print 'unrar ' + f
                    return False
                else:
                    # NOTE(review): recurses with the bare file name instead
                    # of its full path (``f``) -- looks like a bug; confirm
                    # the intended argument.
                    unpack(file)
    print 'unpack done!!'
    return True
def __extract(self):
    """elFinder-style command: extract the target archive into a new folder
    next to it and report the added tree nodes in self.response."""
    target = self.data['target']
    source_volume = self.get_volume(target)
    archive_file = source_volume.get_info(target)
    archive_file_path = source_volume._find_path(target)
    # folder name = archive file name without its extension
    archive_name = archive_file_path.split('/')[-1].split('.')[0]
    # destination: <parent dir of the archive>/<archive_name>
    folder_path = os.path.join(
        source_volume._find_path(archive_file.get('phash')),
        archive_name
    )
    # create the destination folder inside the archive's parent directory
    self.get_volume(archive_file.get('phash')).mkdir(archive_name, archive_file.get('phash'))
    patoolib.extract_archive(archive_file_path, outdir=folder_path, interactive=False)
    # collect the tree nodes that now live at the new folder path
    added = []
    for node in source_volume.get_tree(archive_file.get('phash')):
        if source_volume._find_path(node['hash']) == folder_path:
            added.append(node)
    self.response.update({"added": added})
def extractFiles(inDir, outDir):
    """Unpack every .gz archive in ``inDir`` into ``outDir``, skipping
    archives whose extracted file already exists there."""
    os.chdir(inDir)
    # every gzip archive in the input directory
    archives = glob.glob("*.gz")
    # create the output directory on first use
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    # names already extracted -- used to avoid duplicate work
    existing = os.listdir(outDir)
    for archive in archives:
        # the extracted name is the archive name minus its ".gz" suffix
        if archive[:-3] not in existing:
            patoolib.extract_archive(archive, outdir=outDir)
def extract_nested(folder):
    """
    Unzip, untar, unrar, or whatever any file found in the student
    submission.  Each archive is extracted next to itself and then
    deleted; failures are reported on stderr and left in place.
    """
    import patoolib
    supported_suffixes = ('.zip', '.rar', '.tar.gz', '.tgz', '.tar.bz2',
                          '.tar.xz', '.7z', '.tar')
    for root, dirs, files in os.walk(folder):
        for entry in files:
            if not entry.endswith(supported_suffixes):
                continue
            try:
                archive = os.path.join(root, entry)
                vprint('Extracting archive: "%s"' % archive)
                patoolib.extract_archive(archive, verbosity=-1,
                                         interactive=False, outdir=root)
                # extracted successfully -- the archive itself can go
                os.remove(archive)
            except patoolib.util.PatoolError as e:
                print(e, file=sys.stderr)
                print('Failed to extract "%s"' % archive, file=sys.stderr)
def run(unpack=True):
    """Download the CORINE 18.4 source archives, optionally unpacking each one.

    The destination is taken from the CORINE_DATA_DIRECTORY environment
    variable; the function aborts with a message when it is unset.
    """
    datadir = os.environ.get('CORINE_DATA_DIRECTORY', '')
    if not datadir:
        print('Datadir not found, please specify CORINE_DATA_DIRECTORY env var.')
        return
    for url in const.SOURCE_URLS_18_4:
        filepath = os.path.join(datadir, os.path.basename(url))
        print('Downloading file', url, filepath)
        # stream the response so large archives are not held in memory
        response = requests.get(url, stream=True)
        with open(filepath, "wb") as handle:
            for chunk in response.iter_content(chunk_size=1024):
                if not chunk:
                    continue  # skip keep-alive chunks
                handle.write(chunk)
        if unpack:
            print('Unpacking file', filepath)
            patoolib.extract_archive(filepath, outdir=datadir)
def archive_extractor(input_dir, output_dir):
    """Recursively extract every zip/rar archive under *input_dir* into *output_dir*.

    After extracting an archive, the first subdirectory created under the
    current directory is itself scanned for nested archives, which are
    extracted into an "output" folder inside it.

    Fixes vs. the original:
    - paths are built with os.path.join instead of hard-coded "\\" and
      the broken '\output' literal (backslash + "o"), so the function
      works on non-Windows systems;
    - the subdirectory lookup is guarded, so an extraction that produces
      no subdirectory no longer raises IndexError.
    """
    os.chdir(input_dir)
    for dirname, dirnames, filenames in os.walk(input_dir):
        for filename in filenames:
            print(filename)
            if filename.endswith('zip') or filename.endswith('rar'):
                try:
                    patoolib.extract_archive(filename, outdir=output_dir)
                    print(filename, 'extracted')
                    # recurse into the first subdirectory the extraction created
                    subdirs = next(os.walk('.'))[1]
                    if not subdirs:
                        continue  # nothing to recurse into
                    sub_dir = os.path.join(input_dir, subdirs[0])
                    print(sub_dir)
                    new_output = os.path.join(sub_dir, 'output')
                    if not os.path.exists(new_output):
                        os.makedirs(new_output)
                    archive_extractor(sub_dir, new_output)
                except PatoolError as e:
                    print(e)
    os.chdir('..')
def __unzip_to_dir(self, zippedfile, dest_directory):
    """Extract *zippedfile* into *dest_directory*, falling back to rar extraction.

    Tries the stdlib zip extractor first; when that fails and the file has a
    .rar extension, retries with patool.

    Returns:
        True on success, False when the archive could not be extracted.

    Fixes vs. the original: the ZipFile handle is closed via ``with`` (it
    leaked before), bare ``except:`` clauses were narrowed to
    ``except Exception`` so Ctrl-C is not swallowed, and a non-.rar failure
    now returns False instead of silently reporting success.
    """
    ret = True
    try:
        # unzip into directory (extractall creates it if necessary)
        with zipfile.ZipFile(zippedfile) as tmpZip:
            tmpZip.extractall(dest_directory)
        self.logger.info("Extracting files to:" + dest_directory)
    except Exception:
        self.logger.debug("Error Unzipping zip:" + zippedfile + "\ntrying rar:", exc_info=True)
        filenameonly, file_ext = os.path.splitext(zippedfile)
        if file_ext == '.rar':
            try:
                # OK lets try .rar extractor; first create the directory
                if not os.path.exists(dest_directory):
                    os.makedirs(dest_directory)
                self.logger.info(" Unzip failed trying .rar file to:" + dest_directory)
                # then extract
                patoolib.extract_archive(zippedfile, verbosity=1, outdir=dest_directory)
            except Exception:
                self.logger.debug("Error cannot extract rar file", exc_info=True)
                ret = False
        else:
            # zip failed and there is no rar fallback: report failure
            ret = False
    return ret
def _archive_extract (self, archive, check, verbosity=0):
    """Extract *archive* inside a scratch directory and optionally verify it.

    Extraction runs with the working directory switched to a fresh temp
    dir so the archive's contents never pollute the caller's cwd; the
    temp dir is always removed afterwards.

    Args:
        archive: path to the archive to extract.
        check: when truthy, passed to check_extracted_archive to
            validate the extraction output.
        verbosity: forwarded to patoolib.extract_archive.
    """
    # create a temporary directory for extraction
    # NOTE(review): `basedir` is a module-level path defined elsewhere in the file
    tmpdir = patoolib.util.tmpdir(dir=basedir)
    try:
        # chdir returns the previous directory so it can be restored below
        olddir = patoolib.util.chdir(tmpdir)
        try:
            output = patoolib.extract_archive(archive, program=self.program, verbosity=verbosity)
            if check:
                self.check_extracted_archive(archive, output, check)
        finally:
            # restore the original cwd before the scratch dir is deleted
            if olddir:
                os.chdir(olddir)
    finally:
        # always clean up the scratch directory, even on failure
        shutil.rmtree(tmpdir)
def ftpDownloader(Id, startID, endID, url="<url here>", user="******", passwd="<password here>"):
    """Download a numbered range of files over FTP, unpacking any archives.

    For each id in [startID, endID] the remote path template is formatted,
    the file is fetched into ``pathname`` (a module-level directory), and
    .gz/.zip/.tar downloads are extracted into an "unpack" subfolder.
    Files the server refuses (permission error) are reported and removed.
    """
    ftp = FTP(url)
    ftp.login(user, passwd)
    if not os.path.exists(pathname):
        os.makedirs(pathname)
    print(ftp.nlst())
    ftp.cwd("<ftp working durectory here>")
    os.chdir(pathname)
    for array in range(startID, endID + 1):
        # Enter full path below, including start and stop IDs
        fullpath = '<insert ftp path here>' % (array, Id, array)
        filename = os.path.basename(fullpath)
        try:
            with open(filename, 'wb') as out_fh:
                ftp.retrbinary('RETR %s' % fullpath, out_fh.write)
            print("%s downloaded" % filename)
            # archives get expanded right after download
            if filename.endswith(('.gz', '.zip', '.tar')):
                patoolib.extract_archive(filename, outdir="unpack")
        except error_perm:
            print("%s is not available" % filename)
            os.remove(filename)
    ftp.close()
def archive2dir(archive, remove_dir_structure, out_dir):
    """Extract *archive* into *out_dir*.

    With remove_dir_structure set, the archive's internal directory layout
    is flattened: files are staged in a temp dir and then copied into a
    fresh uuid-named folder under out_dir. Otherwise the archive is
    extracted into out_dir as-is.
    """
    if not remove_dir_structure:
        # keep the archive's own layout
        patoolib.extract_archive(archive, outdir=out_dir)
        return
    result_dir = os.path.join(out_dir, str(uuid.uuid4()))
    create_dirs(result_dir)
    # stage the extraction in a throwaway directory
    staging = tempfile.mkdtemp()
    patoolib.extract_archive(archive, outdir=staging)
    # copy every extracted file flat into the result directory
    for src in get_files(staging, recursive=True):
        dest = out_file_name(result_dir, src)
        if os.path.abspath(src) != dest:  # don't copy a file onto itself
            shutil.copy2(src, dest)
    # remove the staging directory and its contents
    shutil.rmtree(staging)
def check_created_archive_with_diff(self, archive, srcfiles):
    """Extract created archive again and compare the contents."""
    # diff srcfile and output; skip silently when diff is unavailable
    diff = patoolib.util.find_program("diff")
    if not diff:
        return
    # some programs cannot extract what they create; map them to one that can
    extractor_for = {
        'compress': 'gzip',
        'zip': 'unzip',
        'lcab': 'cabextract',
        'shar': 'unshar',
        'genisoimage': '7z',
    }
    program = extractor_for.get(self.program, self.program)
    tmpdir = patoolib.util.tmpdir(dir=basedir)
    try:
        olddir = patoolib.util.chdir(tmpdir)
        try:
            output = patoolib.extract_archive(archive, program=program)
            if len(srcfiles) == 1:
                source = os.path.join(datadir, srcfiles[0])
                patoolib.util.run_checked([diff, "-urN", source, output])
            else:
                for srcfile in srcfiles:
                    source = os.path.join(datadir, srcfile)
                    target = os.path.join(output, srcfile)
                    patoolib.util.run_checked([diff, "-urN", source, target])
        finally:
            # restore cwd before the scratch dir is removed
            if olddir:
                os.chdir(olddir)
    finally:
        shutil.rmtree(tmpdir)
# Recognized archive extensions, used to derive a bare name for the
# extracted directory. NOTE: the original list contained 'xz' without a
# leading dot, which made the suffix-strip match "xz" anywhere in a name.
archive_suffix = ['.rar', '.zip', '.7z', '.tar', '.gz', '.tgz', '.tar.gz',
                  '.xz', '.bz2']


def unpack_file(zipfile, outpath='.'):
    """Extract archive *zipfile* into *outpath* via a temporary directory.

    A single top-level entry is promoted directly into outpath; multiple
    entries are kept together in a directory named after the archive
    (extension stripped).

    Returns:
        -1 on failure (directory creation or extraction error),
        None on success.

    Fixes vs. the original: Python-2 syntax (``except OSError, e`` and
    print statements) converted to Python 3, and the suffix strip now
    removes only a trailing extension (longest match first) instead of
    splitting at the first occurrence anywhere in the name, which
    truncated e.g. "foo.tgz" at its ".gz" substring.
    """
    tmpdir = os.path.join(outpath, TMP_DIR)
    try:
        if not os.path.exists(outpath):
            os.mkdir(outpath)
        os.mkdir(tmpdir)
    except OSError as e:
        print("mkdir %s failed: %s" % (tmpdir, str(e)))
        return -1
    try:
        patoolib.extract_archive(zipfile, outdir=tmpdir)
    except patoolib.PatoolError:
        # TODO: mark zipfile in RED
        print("unpack file %s failed!" % zipfile)
        return -1
    else:
        all_files = os.listdir(tmpdir)
        if len(all_files) == 1:
            # single top-level entry: promote it and drop the wrapper dir
            shutil.move(os.path.join(tmpdir, all_files[0]), outpath)
            os.rmdir(tmpdir)
        elif len(all_files) > 1:
            # multiple entries: rename tmpdir after the archive's bare name
            barename = os.path.basename(zipfile)
            for suffix in sorted(archive_suffix, key=len, reverse=True):
                if barename.endswith(suffix):
                    barename = barename[:-len(suffix)]
                    break
            newdir = os.path.join(outpath, barename)
            os.rename(tmpdir, newdir)
def __extractor__extract_rar(self, dst_dir_path):
    """Unpack the instance's archive (``self.archive_path``) into *dst_dir_path* via patool."""
    source = self.archive_path
    patoolib.extract_archive(source, outdir=dst_dir_path)
def extract_archive(filename, directory):
    """Thin wrapper: unpack archive *filename* into *directory* with patool."""
    destination = directory
    patoolib.extract_archive(filename, outdir=destination)