def removeallemptydir(path, verbose=False, removetrash=False):
    """Recursively remove empty directories; optionally delete known trash files first."""
    trashes = (r'^desktop\.ini$', r'^thumbs\.db$', r'^\.picasa\.ini$', r'^.*?\.thm$')
    result = True
    for f in os.listdir(path):
        fullname = os.path.join(path, f)
        if os.path.isfile(fullname):
            if not removetrash:
                result = False
            elif any(re.match(t, f, re.I | re.U) for t in trashes):
                try:
                    os.chmod(fullname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
                    os.remove(fullname)
                    if verbose:
                        print GREEN+"del", fullname
                except Exception as e:
                    gprint("Cannot remove trash file [<>]: <>", fullname, RED+str(e))
                    result = False
            else:
                result = False  # non-trash file: this directory cannot be emptied
        else:
            result = removeallemptydir(fullname, verbose, removetrash) and result
    if result:
        os.chmod(path, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
        try:
            os.rmdir(path)
            if verbose:
                print GREEN+"rmdir", path
        except Exception as e:
            gprint("Cannot rmdir [<>]: <>\n", path, str(e))
    return result
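# Hedged usage sketch for removeallemptydir (the path is illustrative; os/re/stat
# and the GREEN/RED color constants are assumed imported at module top):
def _demo_removeallemptydir():
    removed = removeallemptydir("D:/pics/inbox", verbose=True, removetrash=True)
    if not removed:
        print "Some files remain; directory tree kept."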
def printme(self):
    lastp = u''
    for t, fns in self.result.iteritems():
        print "\n\n", RED+t, len(fns)
        for fullpath in fns:
            p, fn = os.path.split(fullpath)
            if p != lastp:
                print p
                lastp = p
            print "\t", fn
    for t, fns in self.result.iteritems():
        gprint("\n\n<> can be guessed as:", t)
        for p in sorted(set(os.path.split(fn)[0] for fn in fns)):
            dt0, dt1, guess = self.pathdts[p][0], self.pathdts[p][1], self.guessdts[p]
            # Pick highlight colors: no guess -> RED, no reference date -> YELLOW,
            # otherwise color by whether the guess is within 90 days of each reference date.
            c0, c1, color = (guess is None) and ("", "", RED) or \
                            (dt0 is None) and ("", "", YELLOW) or \
                            {(True, True): ("", "", GREEN),
                             (False, False): ("", "", RED),
                             (True, False): ("", RED, ""),
                             (False, True): (RED, "", ""),
                             }[(abs((dt0-guess).days) < 90, abs((dt1-guess).days) < 90)]
            gprint('\n\t<>\n\t\t[<>, <>] <>\n', p,
                   (c0 + dt0.strftime("%Y-%m%d")) if dt0 else '',
                   (c1 + dt1.strftime("%Y-%m%d")) if dt1 else '',
                   color + (guess.strftime("%Y-%m%d") if guess else "GUESSFAILED"))
    c = sum([len(fns) for (k, fns) in self.result.iteritems() if k != 'dtmayerror'])
    print RED+"{} files cannot be archived.".format(c) if c else GREEN+"All files can be archived."
    if self.datetimegener is not None:
        print GREEN + "After gen, you can check again."
def walkfiles(path, condition=lambda x: True, debug=False, pre=""):
    """Yield the full path of every file under `path` whose name satisfies `condition`."""
    for root, dirs, files in os.walk(path):
        myfiles = filter(condition, files)
        c = len(myfiles)
        if debug and c:
            gprint("<><> [<>]\n", pre, root, BLUE+str(c))
        for f in myfiles:
            yield os.path.join(root, f)
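# walkfiles is a generator, so huge trees are streamed rather than built into a
# list. Hedged sketch (the .jpg condition below is illustrative only):
def _demo_walkfiles():
    isjpg = lambda name: name.lower().endswith(".jpg")
    for fullpath in walkfiles("D:/pics", condition=isjpg, debug=True):
        print fullpath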
def adddir(self, path, rootdir):
    gprint("\nAdding [<>] to archive\n", BLUE+BWHITE+path)
    fns = [f for f in gpylib.misc.walkfiles(path, _WALK_CONDITION, True, "\n\n")]
    gprint(RED+"Shuffling the files\n")
    random.shuffle(fns)
    for fn in fns:
        try:
            self.add(fn, rootdir)
        except PicException as e:
            self.failed.append(fn)
    print "\n"
def __call__(self, fn):
    md5 = self.cache.get(fn, None)
    if md5 is None:
        gprint(YELLOW+"5\b")
        h = hashlib.new('md5')
        with open(fn, 'rb') as f:  # must open in binary mode, or distinct files may yield the same digest
            h.update(f.read())
        md5 = h.hexdigest()
        self.cache[fn] = md5
    else:
        self.reachcount += 1  # cache hit (was a bare `reachcount`, an undefined local)
    return md5
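# The __call__ above belongs to an MD5-cache object whose class is not shown in
# this section. A minimal, hypothetical sketch of that wrapper, inferred from
# the `gpylib.misc.getmd5.status` unpacking in printdumpinfo below:
import hashlib

class _Md5Cache(object):
    """Callable MD5 cache: _Md5Cache()(fn) returns a hex digest, memoized by path."""
    def __init__(self):
        self.cache = {}       # fn -> hex digest
        self.reachcount = 0   # number of cache hits

    @property
    def status(self):
        # two values, matching gprint("... Size [<>], reachcount [<>]", *status)
        return (len(self.cache), self.reachcount)

    # the __call__ defined above would be attached to this class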
def grouppicdir(dir, dst, best=True, dstw=2970, dsth=2100, margin=20):
    """Stitch the jpg files under a directory into one image."""
    gprint(u"Combining photos [<>] [<>]\n\n", dir, GREEN+"Best" if best else "")
    (dd, df) = os.path.split(dst)
    if not df.startswith("_"):
        print "[%s] must start with _" % df
        return
    files = [os.path.join(dir, f) for f in os.listdir(dir)
             if re.match(r"[^_].+\.jpg", f, re.I) is not None]
    grouppic(files, dst, dstw, dsth, margin, best)
def selftest(self, verbose=False):
    lastpg = None
    errorcount = 0
    md5 = hashlib.new('md5')
    print BLUE+BWHITE+"\n\nSelf Testing"
    for pg in self.groups:
        if verbose:
            gprint("[<>] - [<>] : <> Files.\n",
                   pg.pics[0].datetime, pg.pics[-1].datetime,
                   BLUE+"{:03d}".format(len(pg.pics)))
        if lastpg is not None and lastpg.pics[-1].cmptime(pg.pics[0]) >= 0:
            errorcount += 1
            gprint("<>\n\t<>\n\t<>\n", RED+"Error Found: Group split",
                   lastpg.path, pg.path)
        lastf = None
        for f in pg.pics:
            md5.update(f.easyhash.encode("gb2312"))
            if lastf is not None and lastf.cmptime(f) > 0:
                errorcount += 1
                gprint("<>\n\t<>\n\t<>\n", RED+"Error Found: File Order",
                       lastf.fullname, f.fullname)
            lastf = f
        lastpg = pg
    gprint("\nCheck end, [<>] errors found.\nDigest[<>]\n",
           (RED if errorcount else GREEN) + str(errorcount),
           BLUE + md5.hexdigest())
def dump(self, newdir, fake, movenewfile):
    newfullname = os.path.join(newdir, self.name)
    if newfullname.lower() == self.fullname.lower():
        return "File No Action"
    # Pick the action: files already inside the archive are moved;
    # new files are moved or copied depending on movenewfile.
    (out, action, ret) = (self.rootdir is None) and (GREEN+"M", shutil.move, "Move Archive") or \
                         movenewfile and (RED+"M", shutil.move, "Move New") or \
                         ("C", shutil.copy2, "Copy New")
    gprint(out)
    if not fake:
        action(self.fullname, newfullname)
        self.fullname, self.rootdir = newfullname, None
    return ret
def printdumpinfo(self):
    print BLUE+BWHITE+"Dump information\n"
    print "Old", BLUE+str(len(self.old))
    print u"".join([u"{}\n\t{}\n\n".format(new.fullname, old.fullname) for new, old in self.old])
    print "\n", YELLOW+"Overwrite", BLUE+str(len(self.overwrite))
    print u"".join([u"{}\n\t{}\n\n".format(new.fullname, old.fullname) for new, old in self.overwrite])
    print "\n", RED+"Failed", len(self.failed)
    print u"\n".join(self.failed)
    print "Actions:{"
    gpylib.misc.ppdict(self._dumpactions, prekey=u" "*4, prevalue=u" "*8)
    print "}"
    gprint("\nTotal file [<>], readsize [<>], readmd5 [<>]\n",
           _PicFile.totalfile, _PicFile.totalsize, _PicFile.totalmd5)
    gprint("MD5 Cache Size [<>], reachcount [<>]\n", *gpylib.misc.getmd5.status)
def dumpstep1(self, fake, movenewfile, actions):
    if self.clean or self.count() == 0:
        actions['Dir No Action'] += 1
        return
    print "\n\n",
    if self.path is None:  # the group directory has not been created yet
        self.path = os.path.join(self.basepath, self.pics[0].datetime.replace(':', "-"))
        actions['MkDir'] += 1
        print "mkdir", self.path
        if not fake:
            os.mkdir(self.path)
    else:
        print self.path
    gprint("[<>] Files\n", len(self.pics))
    for p in self.pics:
        actions[p.dump(self.path, fake, movenewfile)] += 1
def add(self, fn, rootdir):
    pf = _PicFile(fn, rootdir)
    samefile = self.findsamefile(pf)  # look up by MD5
    if samefile:
        pf.exif = samefile.exif  # only update its date, so later lookups can find it
    if len(self.groups) == 0:
        self.groups.append(_PicGroup.newone(self.path, pf))
        return
    for g in self.groups:
        if pf <= g:
            # If pf is smaller than the first group, it joins the first group;
            # if pf falls between two groups, it joins the later one;
            # if pf is larger than every group, it joins the last group.
            break
    result, refpf = g.add(pf)  # renamed from `re`, which shadowed the re module
    if result == "Old":
        gprint(DIM+"O")
        self.old.append((pf, refpf))
    elif result == "Overwrite":
        gprint(YELLOW+"W")
        self.overwrite.append((pf, refpf))
    else:  # 'New'
        self.md5dict[pf.size].append(pf)
        gprint(GREEN+"N")
    if g.count() >= self.max:
        ng = g.split()
        if ng is not None:
            bisect.insort_left(self.groups, ng)
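# Hedged sketch of why bisect.insort_left works in add(): _PicGroup is assumed
# to order by its first picture's timestamp. A minimal stand-in demonstrating
# the same insertion (names here are hypothetical):
import bisect

class _DemoGroup(object):
    def __init__(self, first_ts):
        self.first_ts = first_ts
    def __lt__(self, other):  # bisect only needs __lt__
        return self.first_ts < other.first_ts

def _demo_group_order():
    groups = [_DemoGroup(1), _DemoGroup(5)]
    bisect.insort_left(groups, _DemoGroup(3))  # lands between the two
    print [g.first_ts for g in groups]         # [1, 3, 5]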
def ppdict(d,prekey=u"",postkey=u"", prevalue=u"\t",postvalue=u""): for k,v in d.iteritems(): gprint(u"<><><>\n",prekey,k,postkey) if isinstance(v,list): for i in v: gprint(u"<><><>\n",prevalue, i, postvalue) else: gprint(u"<><><>\n",prevalue,v, postvalue)
def selftest(self, verbose=False):
    lastpg = None
    errorcount = 0
    print BLUE+BWHITE+"\n\nSelf Testing"
    for pg in self.groups:
        if verbose:
            gprint("[<>] - [<>] : <> Pics\n",
                   pg.pics[0].datetime, pg.pics[-1].datetime,
                   BLUE+"{:03d}".format(len(pg.pics)))
        if lastpg is not None and lastpg.pics[-1].cmptime(pg.pics[0]) >= 0:
            errorcount += 1
            gprint("<>\n\t<>\n\t<>\n", RED+"Error Found: Group split",
                   lastpg.path, pg.path)
        lastf = None
        for f in pg.pics:
            if lastf is not None and lastf.cmptime(f) > 0:
                errorcount += 1
                gprint("<>\n\t<>\n\t<>\n", RED+"Error Found: File Order",
                       lastf.fullname, f.fullname)
            lastf = f
        lastpg = pg
    gprint("\nCheck end, [<>] errors found.\n",
           (RED if errorcount else GREEN) + str(errorcount))
def findsamefile(*dirlist):
    """Report files that share both size and MD5 across the given directories."""
    gprint(u"Checking the following directories:\n<>\n",
           u"".join(u"\t[{}]\n".format(d) for d in dirlist))
    sizedict, fc = defaultdict(list), 0
    for fn in itertools.chain(*(gpylib.misc.walkfiles(d, lambda x: True, True) for d in dirlist)):
        sizedict[os.stat(fn).st_size].append(fn)
        fc += 1
    # Only files whose size collides need an MD5.
    md5dict, mc = defaultdict(list), 0
    for f in itertools.chain(*filter(lambda x: len(x) > 1, sizedict.itervalues())):
        gpylib.misc.printworking()
        md5dict[gpylib.misc.getmd5(f)].append(f)
        mc += 1
    samefilelist = filter(lambda x: len(x) > 1, md5dict.itervalues())
    gprint("\nTotal [<>] duplicate files.\n", BLUE + str(len(samefilelist)))
    print u"\n\n\n".join("\n".join(fns) for fns in sorted(samefilelist))
    gprint("Total [<>] files, calc md5 [<>].", BLUE+str(fc), RED+str(mc))
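# The two-pass design above matters: only files whose *size* collides ever get
# hashed, so the MD5 cost scales with candidate duplicates, not the whole tree.
# Hedged usage sketch (directory paths are illustrative):
def _demo_findsamefile():
    findsamefile("D:/pics/2011", "D:/pics/backup")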
def printse(self):
    gprint("\nGroup [<>]\n\t", BLUE+str(len(self.pics)))
    print "\n\t".join(pf.name for pf in [self.pics[0], self.pics[-1]])
def printdebug(self, step=50):
    gprint("\nGroup [<>]\n\t", BLUE+str(len(self.pics)))
    print "\n\t".join(pf.name for pf in self.pics[::step])
def printonlysi(self):
    gprint("\nGroup [<>]\n\t", BLUE+str(len(self.pics)))
    print "\n\t".join([u"{}\n\t\t{}\n\t\t{}".format(pf.name, pf.fullname, pf._pre.fullname)
                       for pf in self.pics
                       if re.match(ur"^N?[SI]+$", pf.nameflag, re.U) is not None])
def printme(self): print "Total ",self.count print "\n", RED+"No Exif", len(self.noexifs) print u"\n".join(self.noexifs) print "\n", RED+"No MMDT", len(self.nommdts) print u"\n".join(self.nommdts) print "\n", RED+"No MM", len(self.nomms) print u"\n".join([u"{}\n\t{}".format(*item) for item in self.nomms]) print "\n", RED+"Zero DT", len(self.zeordts) print u"\n".join(self.zeordts) print "\n", RED+"MM No DateTime", len(self.mmnodts) print "\n".join([u"{}\n\t[{}]".format(f,d) for f,d in self.mmnodts]) print "\n", BLUE+"Date Time", len(self.ymds) print "\n".join(["{}\t{}".format(d,c) for (d,c) in sorted(self.ymds.items())]) print "\n", BLUE+"Make Models", len(self.models) print "\n".join(["{}\t\t{}".format(m,c) for (m,c) in self.models.iteritems()]) print "\n", RED+"No MM Paths", len(self.nomms) print u"\n".join(sorted(set(os.path.split(fn)[0] for (fn,dt) in self.nomms))) print "\n", RED+"Zero DT Paths", len(self.zeordts) print u"\n".join(sorted(set(os.path.split(fn)[0] for fn in self.zeordts))) #相同size、相同MD5 但是Datatime不同的,这样备份后会重复。 #相同size、相同Datetime,不同MD5的,这样比较Funny #三个都相同的,会在备份的时候自然只会选择一个 def THOSE_VALUE_MORE_THAN_ONE(d): return filter(lambda x: len(x[1])>1, d.iteritems()) for (size,fndts) in THOSE_VALUE_MORE_THAN_ONE(self.sizes): dt_md5s = defaultdict(lambda: defaultdict(list)) md5_dts = defaultdict(lambda: defaultdict(list)) dtmd5_fns = defaultdict(list) for (fn,dt) in fndts: md5=gpylib.misc.getmd5(fn) dt_md5s[dt][md5].append(fn) md5_dts[md5][dt].append(fn) dtmd5_fns[(dt,md5)].append(fn) for (dt,md5s) in THOSE_VALUE_MORE_THAN_ONE(dt_md5s): #Funny: size,datetime相同,但是MD5不同 gprint("\n<> [<>] [<>]\n",BLUE+"Funny",size, dt) gpylib.misc.ppdict(md5s,u" [",u"]", u" ",u"") #这个item是 key/list for (md5, dts) in THOSE_VALUE_MORE_THAN_ONE( md5_dts): #重复: size,MD5相同,但是DateTime不同 gprint("\n<> [<>] [<>]\n",YELLOW+"Duplicate",size, md5) gpylib.misc.ppdict(dts,u" [",u"]", u" ",u"") for ((dt,md5),fns) in THOSE_VALUE_MORE_THAN_ONE( dtmd5_fns): #有size,md5,DateTime都相同的情况 gprint("\n<> [<>] [<>] [<>]\n ", GREEN+"Same File", size, dt, md5) print u"\n ".join(fns)
def testarchiveiflost(newdir, archivedir, verbose, del_ifok=False):
    """Verify every file under newdir also exists (by size + MD5) in archivedir."""
    gprint("Check [<>] in [<>], verbose[<>], del_ifok[<>]\n\n", newdir, archivedir, verbose, del_ifok)
    print BLUE+BWHITE+"Indexing archive dir.."
    verboselist = []
    archivedict = defaultdict(list)
    for fn in gpylib.misc.walkfiles(archivedir, _WALK_CONDITION, True):
        archivedict[os.stat(fn).st_size].append([fn, None])  # MD5 computed lazily
    print BLUE+"\nChecking..."
    lostcount, removecount = 0, 0
    for fn in gpylib.misc.walkfiles(newdir, _WALK_CONDITION, True, pre="\n"):
        size = os.stat(fn).st_size
        fnmd5s = archivedict.get(size, None)
        if fnmd5s is None:
            gprint("<>\n\t<>\n", RED+"Lost:", fn)
            lostcount += 1
            continue
        md5 = gpylib.misc.getmd5(fn)
        for fnmd5 in fnmd5s:
            if fnmd5[1] is None:
                fnmd5[1] = gpylib.misc.getmd5(fnmd5[0])
            if fnmd5[1] == md5:
                if verbose == '1':
                    gprint("\n<>\n\t<>\n", fn, fnmd5[0])
                elif verbose != "0" and verbose is not False:
                    verboselist.append((fn, fnmd5[0]))  # verbose holds a report filename here
                if del_ifok == "del_ifok":
                    os.chmod(fn, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
                    os.remove(fn)
                    gprint(RED+"D")
                    removecount += 1
                break
        else:
            gprint("<>\n\t<>\n", RED+"Lost:", fn)
            lostcount += 1
    gprint("\nCheck ended.\n\t[<>] files lost.\n\t[<>] files deleted.",
           YELLOW+str(lostcount), RED+str(removecount))
    if len(verboselist) != 0:
        with open(verbose, "w") as f:
            f.writelines("\n{}\n\t{}\n".format(*a) for a in verboselist)
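# Note the three-way `verbose` contract above: '1' prints each match inline,
# '0'/False suppresses reporting, and any other string is used as the name of
# a report file written at the end. Hedged usage sketch (paths illustrative):
def _demo_testarchiveiflost():
    testarchiveiflost("D:/pics/new", "D:/archive", verbose="matches.txt")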
def printworking(current=[0]):
    """Print one spinner frame; the mutable default argument keeps state across calls."""
    chars = [RED+'-', GREEN+'\\', BLUE+"|", YELLOW+'/']
    gprint("<><>", chars[current[0]], "\b")
    current[0] = (current[0] + 1) % 4
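# Together with the trailing "\b" backspace, successive calls redraw the same
# screen cell, producing a spinner. Illustrative loop, one call per unit of work:
def _demo_printworking():
    for _ in xrange(200):
        printworking()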