def FindInvalidImg(path, delflag=0, exclude_files='', exclude_exts=''):
    '''Find image files that OpenCV cannot read.

    Every file gathered under ``path`` is passed to ``cv2.imread``; a file for
    which it returns ``None`` is reported as invalid.

    @param path ---- directory to search
    @param delflag --- delete flag: 0 - keep the files, 1 - delete every invalid file found
    @param exclude_files --- file names to skip, e.g. "db.json,num.txt"
    @param exclude_exts ---- extensions to skip, e.g. ".json,.txt"
    @return list of invalid file paths, e.g. ['/temp/a.jpg', '/temp/a/b.jpg']

    Example:
        files = FindInvalidImg('./temp', 0, "db.json,num.txt", ".json,.txt")
    '''
    print('FindInvalidImg(%s,delflag=%d)' % (path, delflag))
    print('GatherFiles(%s,exts=' ',exclude_files="%s",exclude_exts="%s")' % (path, exclude_files, exclude_exts))
    candidates = GatherFiles(path, exclude_files=exclude_files, exclude_exts=exclude_exts)
    total = len(candidates)
    progress = ProcessBar.ShowProcess(100, 'Inv Search', '', 'OK')
    print('Search results:')

    broken = []
    for idx, candidate in enumerate(candidates):
        # cv2.imread signals an unreadable file by returning None.
        if cv2.imread(candidate) is None:
            broken.append(candidate)
            if delflag == 1:
                os.remove(candidate)
        # Refresh the progress bar only every 50 files.
        if idx % 50 == 0:
            progress.show_process(int(idx * 100 / total))
    progress.show_process(100)
    return broken
def GetSameFiles(self):
    '''Group the files under ``self.path`` that share the same size and MD5.

    Files are gathered with ``funs.GatherFiles`` (filtered by ``self.exts``),
    one feature row per file is obtained from ``self.get_feature`` and the rows
    are grouped on the ('fsize', 'md5') columns.

    NOTE(review): ``get_feature`` is assumed to return one value per column in
    the order (sfile, fsize, md5) -- confirm against its definition.

    @return list with one pandas Series per group that has more than one
            member; each Series holds the 'sfile' values of that group
    '''
    # Collect the files.
    print('收集文件列表')
    files = funs.GatherFiles(self.path, exts=self.exts)

    # Compute the per-file features.
    print('计算文件特征')
    pb = ProcessBar.ShowProcess(100, '', '', infoDone='Done')
    nFiles = len(files)
    rows = []
    for i, sfile in enumerate(files):
        rows.append(self.get_feature(sfile))
        if i % 50 == 0:
            pb.show_process(int(i * 100 / nFiles))
    pb.show_process(100)

    # Build the table once: enlarging a DataFrame row by row with
    # ``df.loc[i] = ...`` reallocates on every insertion.
    df = pd.DataFrame(rows, columns=['sfile', 'fsize', 'md5'])

    # Keep only the groups that contain more than one file.
    files_same = []
    for _, group in df.groupby(['fsize', 'md5']):
        if group.shape[0] > 1:
            files_same.append(group.loc[:, 'sfile'])
    return files_same
def FindGIFFiles(path, delflag=0, unzipflag=0):
    '''Find GIF files below a directory.

    A file counts as a GIF when ``imghdr.what`` reports 'gif' for it.

    @param path -------- directory to search
    @param delflag ----- delete flag: 0 - keep the GIF files, 1 - delete every GIF found
    @param unzipflag --- extraction flag: 0 - do nothing, 1 - extract the GIF's image
                         sequence into the same directory, named sfile_{n}.png
    @return list of GIF paths, e.g. ['/temp/a.gif', '/temp/b.gif']

    Example:
        files = FindGIFFiles('/temp', 0, 0)
    '''
    print('FindGIFFIles(%s,delflag=%d,unzipflag=%d)' % (path, delflag, unzipflag))
    print('GatherFiles(%s,exts="")' % (path))
    candidates = GatherFiles(path)
    total = len(candidates)
    progress = ProcessBar.ShowProcess(100, 'GIF Search', '', 'OK')

    found = []
    for idx, candidate in enumerate(candidates):
        is_gif = imghdr.what(candidate) == 'gif'
        if is_gif:
            found.append(candidate)
            # Extract first, then delete, so the frames survive the removal.
            if unzipflag == 1:
                gif2png(candidate, os.path.dirname(candidate))
            if delflag == 1:
                os.remove(candidate)
        # Refresh the progress bar only every 50 files.
        if idx % 50 == 0:
            progress.show_process(int(idx * 100 / total))
    progress.show_process(100)
    return found
def DelSameFiles(self):
    '''Delete duplicate files, keeping the first file of every group.

    The groups come from ``self.GetSameFiles()``; within each group every
    entry after the first is removed from disk.
    '''
    # Look up the groups of identical files.
    duplicate_groups = self.GetSameFiles()
    print('删除相同文件')
    progress = ProcessBar.ShowProcess(100, '', '', infoDone='Done')

    # Remove everything but the first member of each group.
    group_count = len(duplicate_groups)
    for idx, group in enumerate(duplicate_groups):
        redundant = group[1:]
        for victim in redundant:
            os.remove(victim)
        progress.show_process(int(idx * 100 / group_count))
    progress.show_process(100)
def CreateThumbs(path, ori_name, thumb_name, exts='', exclude_files='', exclude_exts='', width=100, height=100):
    '''Create thumbnails for the files of an album.

    Source files are gathered from ``<path>/<ori_name>``; the thumbnail path is
    the source path with '/<ori_name>/' replaced by '/<thumb_name>/'. Video
    thumbnails get an extra '.jpg' suffix. Files whose thumbnail already exists
    are skipped.

    @param path album root directory
    @param ori_name name of the directory holding the original files
    @param thumb_name name of the directory receiving the thumbnails
    @param exts passed through to funs.GatherFilesEx
    @param exclude_files passed through to funs.GatherFilesEx
    @param exclude_exts passed through to funs.GatherFilesEx
    @param width thumbnail width
    @param height thumbnail height
    @return None; prints a message and does nothing when the source directory is missing
    '''
    ori_path = '%s/%s' % (path, ori_name)  # directory of the original files
    if not os.path.exists(ori_path):
        print('%s not exists!' % (ori_path))
        return

    # Gather the original files.
    ori_files = []
    funs.GatherFilesEx(ori_path, ori_files, exts=exts, exclude_files=exclude_files, exclude_exts=exclude_exts)
    files_num = len(ori_files)

    # Supported video extensions. A tuple is used on purpose: testing
    # membership in the string '.mp4' would also match '', '.m' and '.mp'.
    video_exts = ('.mp4',)

    pbar = ProcessBar.ShowProcess()
    rep_ori_name = '/%s/' % (ori_name)  # marker of the source directory
    rep_thumb_name = '/%s/' % (thumb_name)  # marker of the thumbnail directory
    for i, src_file in enumerate(ori_files):
        dst_file = src_file.replace(rep_ori_name, rep_thumb_name)
        is_video = os.path.splitext(src_file)[1].lower() in video_exts
        # Video thumbnails are stored as '<dst>.jpg', so that is the path
        # whose existence decides whether work is still needed.
        thumb_file = '%s.jpg' % (dst_file) if is_video else dst_file
        # A path without the source marker maps onto itself; never write a
        # thumbnail next to (or over) such a source file.
        if dst_file != src_file and os.path.exists(src_file) and not os.path.exists(thumb_file):
            if is_video:
                CreateThumb_vedio(src_file, thumb_file, width, height)
            else:
                CreateThumb_img(src_file, thumb_file, width, height)
        # Refresh the progress bar only every 10 files.
        if i % 10 == 0:
            pbar.show_process(int(i * 100 / files_num))
    pbar.show_process(100)
def MoveSameFilesTo(self, to_path):
    '''Move duplicate files to another directory, keeping the first of every group.

    The groups come from ``self.GetSameFiles()``. For each entry after the
    first, the destination is ``to_path`` followed by the part of the source
    path that comes after the first ``len(self.path)`` characters; missing
    destination directories are created.

    @param to_path directory receiving the duplicates
    '''
    # Look up the groups of identical files.
    files_same = self.GetSameFiles()
    print('迁移相同文件:', to_path)
    pb = ProcessBar.ShowProcess(100, '', '', infoDone='Done')

    # Move everything but the first member of each group.
    nGroups = len(files_same)
    src_path_len = len(self.path)
    for i, files_sub in enumerate(files_same):
        for sfile in files_sub[1:]:
            src_file = sfile
            dst_file = '%s%s' % (to_path, sfile[src_path_len:])
            dst_path = os.path.split(dst_file)[0]
            if not os.path.exists(dst_path):
                os.makedirs(dst_path)
            print('src:', src_file)
            print('dst:', dst_file)
            shutil.move(src_file, dst_file)
        # Convert the whole percentage to int (the division used to sit
        # outside the int() call and produced a float).
        pb.show_process(int(i * 100 / nGroups))
    pb.show_process(100)
def Predicts(path):
    '''Classify the images under ``path`` with an ImageNet ResNet50 and print the results.

    Files with the extensions .jpg, .jpeg and .png are gathered, each one is
    loaded at 224x224, preprocessed and fed to the model; the file name and
    the top-3 decoded predictions are printed.

    @param path directory to search for images
    '''
    # Gather the image files.
    files = funs.GatherFiles(path, exts='.jpg,.jpeg,.png')
    pb = ProcessBar.ShowProcess(100)
    model = ResNet50(weights='imagenet')
    nFiles = len(files)
    for i, sfile in enumerate(files):
        img_path = sfile
        img = image.load_img(img_path, target_size=(224, 224))
        # Identity test: "== None" would go through the image's own __eq__.
        if img is None:
            print('img.load_img(%s)=None' % (img_path))
            continue
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)  # add the batch axis
        x = preprocess_input(x)
        preds = model.predict(x)
        # decode the results into a list of tuples (class, description, probability)
        # (one such list for each sample in the batch)
        print(sfile)
        print('Predicted:', decode_predictions(preds, top=3)[0])
        # Predicted: [(u'n02504013', u'Indian_elephant', 0.82658225), (u'n01871265', u'tusker', 0.1122357), (u'n02504458', u'African_elephant', 0.061040461)]
        pb.show_process(int(i * 100 / nFiles))
    pb.show_process(100)
def FindExtNotMatchFiles(path, exts, bFix=False):
    '''Find files whose extension differs from the type reported by imghdr.

    Caveat: a file for which ``imghdr.what`` returns nothing can still be a
    valid image, so the comparison made here is unreliable.
    Recommendation: do not use this function.

    @param path directory to search
    @param exts extensions passed to GatherFiles
    @param bFix when true, rename each mismatching file to the detected
                extension; files whose type could not be detected are
                reported but never renamed
    @return list of (file path, detected extension) tuples; the detected
            extension is '.' when the type could not be determined
    '''
    print('FindExtNotMatchFiles(%s,exts=%s,bFix=%d)' % (path, exts, bFix))
    rets = []
    print('GatherFiles(%s,exts=%s)' % (path, exts))
    files = GatherFiles(path, exts)
    nFiles = len(files)
    pb = ProcessBar.ShowProcess(100, 'Not Match Search', '', 'OK')
    # Look for mismatches.
    for i, sfile in enumerate(files):
        base, ext1 = os.path.splitext(sfile)
        kind = imghdr.what(sfile)
        ext2 = '.' if kind is None else '.' + kind
        # imghdr reports 'jpeg', so a '.jpg' file of that type is a match.
        if ext1 != ext2 and not (ext1 == '.jpg' and ext2 == '.jpeg'):
            rets.append((sfile, ext2))
            # Renaming to a bare '.' would strip the extension from a file
            # that may be perfectly valid, so only fix detected types.
            if bFix and kind is not None:
                new_file = '%s%s' % (base, ext2)
                shutil.move(sfile, new_file)
        # Refresh the progress bar only every 50 files.
        if i % 50 == 0:
            pb.show_process(int(i * 100 / nFiles))
    pb.show_process(100)
    return rets
def __init__(self, path, exts=''):
    '''Store the search settings and prepare the helper objects.

    @param path directory to search
    @param exts extensions to consider, e.g. ".jpg,.png"
    '''
    # Search directory.
    self.path = path
    # Extension filter, e.g. ".jpg,.png".
    self.exts = exts
    # Progress bar.
    self.pb = ProcessBar.ShowProcess(100, '', '', '')
    # Statistics about the search directory.
    self.info_path = funs.PathStat(path)