def _safe_extract_all(self, zipfile, target_dir): """Safer version of ZipFile.extractall -- does not allow absolute or upwards-relative paths""" for zipinfo in zipfile.infolist(): # skip absolute or upwards-relative files if zipinfo.filename.startswith(('/', '..')): warnings.warn('Skipping potentially unsafe file: ' + zipinfo.filename, RuntimeWarning) continue # target_dir is base directory; extract will create subpaths as necessary zipfile.extract(zipinfo, target_dir)
def safe_extract_from_zip(zipfile, filename, out_dir): try: return zipfile.extract(filename, out_dir) except (KeyError) as e: pass return ''
def uncompress(srcfile, destdir):
    """Uncompress *srcfile* into *destdir*, dispatching on its extension.

    Supports .tgz/.tar (tarfile), .zip (zipfile) and .gz (gzip).

    Returns None on success, on an unsupported format, or when a
    tar/zip error was already reported via print(); returns the tuple
    (False, exception, fmt) when decompressing a .gz file fails
    (return shapes kept for backward compatibility with callers).
    """
    import gzip
    import tarfile
    import zipfile as zipmod  # local alias avoids shadowing by a ZipFile instance

    file = os.path.basename(srcfile)
    # BUG FIX: the original tested os.path.isfile(file) -- the basename
    # relative to the current working directory -- so the function silently
    # did nothing unless run from the file's own directory. Test the real
    # source path instead.
    if os.path.isfile(srcfile):
        shortname, fmt = os.path.splitext(file)
        fmt = fmt[1:]
        if fmt in ('tgz', 'tar'):
            try:
                with tarfile.open(srcfile) as tar:
                    for name in tar.getnames():
                        tar.extract(name, destdir)
            except Exception as e:
                print("Can't uncompress {} for {}".format(file, e))
        elif fmt == 'zip':
            try:
                with zipmod.ZipFile(srcfile) as zf:
                    for name in zf.namelist():
                        zf.extract(name, destdir)
            except Exception as e:
                print("Can't uncompress {} for {}".format(file, e))
        elif fmt == 'gz':
            try:
                fname = os.path.join(destdir, os.path.basename(srcfile))
                # BUG FIX: GzipFile.read() returns bytes; the original wrote
                # them into a text-mode ("w+") file, a TypeError on Python 3,
                # and leaked both handles. Use binary mode and context
                # managers.
                with gzip.GzipFile(srcfile) as gfile, open(fname, "wb") as out:
                    out.write(gfile.read())
            except Exception as e:
                return False, e, fmt
        else:
            print('文件格式不支持或者不是压缩文件')
    return None
def unzip(zip_file_path):
    """Extract the first '.dblog' member of the archive into zip_dir.

    Returns the extracted file's path (zip_dir + '\\' + member name), or
    '' when the archive contains no .dblog member or extraction fails.

    NOTE(review): relies on a module-level `zip_dir` (and, in the failure
    message, `zip_file_name`) defined elsewhere in this file -- confirm
    they are set before this is called.
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            for file_name in zip_file.namelist():
                if os.path.splitext(file_name)[1] == '.dblog':
                    # BUG FIX: the original called
                    # zipfile.extract(zip_dir, file_name) -- the *module*
                    # (which has no extract attribute) with the arguments
                    # swapped, so this branch always raised and fell into
                    # the except. Correct form: ZipFile.extract(member, path).
                    zip_file.extract(file_name, zip_dir)
                    return zip_dir + '\\' + file_name
        return ''  # no debug log files found in the zip file
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate.
        if (os.path.getsize(zip_file_path) == 0):
            print('empty file')
        else:
            print("FAILED to extract: " + zip_dir + '\\' + zip_file_name)
        return ''
def arff_to_df(URL=URL_child, arff_file='Autism-Child-Data.arff', force_download=False):
    """Download (if needed) and load an autism-screening ARFF file into a
    cleaned pandas DataFrame.

    Parameters
    ----------
    URL : str
        Zip archive URL; defaults to the module-level URL_child.
    arff_file : str
        Name of the .arff member inside the archive / on disk.
    force_download : bool
        Re-download even if *arff_file* already exists locally.

    Returns
    -------
    pandas.DataFrame
        Renamed/cleaned frame with 'country_of_res' dropped.

    NOTE(review): assumes `arff.loadarff` is a scipy.io.arff-style loader
    (returns (records, metadata)) and that `urlretrieve`/`ZipFile`/`pd`
    are imported elsewhere in this file -- confirm.
    """
    # Fetch and unpack the archive only when the .arff is missing or a
    # refresh is forced; extract() returns the on-disk path.
    if force_download or not os.path.exists(arff_file):
        zipped = urlretrieve(URL, 'autism.zip')
        zipfile = ZipFile(zipped[0], 'r')
        arff_file = zipfile.extract(arff_file)
    ##extracting the data dictionary and column names (description)
    data, description = arff.loadarff(arff_file)
    columns = [i for i in description]
    # Friendlier replacement names, positionally aligned with the ARFF
    # attribute order above.
    new_columns = [
        'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
        'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
        'age', 'gender', 'ethnicity', 'jaundice', 'autism',
        'country_of_res', 'used_app_before', 'result', 'age_range',
        'relation', 'Class/ASD'
    ]
    df = pd.DataFrame(data, columns=columns)
    df.columns = new_columns
    # changing utf-8 coding to categorical variables: ARFF strings load as
    # bytes, so decode each object column before casting to category
    for column in list(df.select_dtypes(include=['object']).columns):
        df[column] = df[column].str.decode('utf-8').astype('category')
    # Fixing ethnicity discrepancies: normalize '?', stray spaces, quoted
    # variants, and casing, then re-cast to category
    df.ethnicity = df.ethnicity.str.replace('?', "Unknown").str.replace(
        ' ', '_').str.replace('\'Middle_Eastern_\'', "Middle_Eastern").str.replace(
            '\'South_Asian\'', 'South_Asian').str.replace('others', 'Others').astype('category')
    # Fill in missing age with median age
    df.age = df.age.fillna(value=df.age.median())
    # Fixing relation discrepancies: collapse family variants and quoted /
    # '?' values into a small canonical set
    relation_mapper = {
        'Parent': 'Family Member',
        'Relative': 'Family Member',
        '\'Health care professional\'': 'Health care professional',
        '?': 'Unknown',
        'Self': 'Self',
        'self': 'Self'
    }
    # Values absent from the mapper become NaN, then are filled below.
    df.relation = df.relation.map(relation_mapper).astype('category')
    df.relation = df.relation.fillna("Unknown")
    # Fixing various binary inputs: yes/no (and YES/NO for the class label)
    # mapped to 1/0
    jaun_mapper = {'yes': 1, 'no': 0}
    aut_mapper = {'yes': 1, 'no': 0}
    class_mapper = {'YES': 1, 'NO': 0}
    app_mapper = {'yes': 1, 'no': 0}
    mapper_list = list([jaun_mapper, aut_mapper, class_mapper, app_mapper])
    for x, y in zip(['jaundice', 'autism', 'Class/ASD', 'used_app_before'],
                    mapper_list):
        df[x] = df[x].map(y)
    df = df.drop('country_of_res', axis=1)
    return df