def _upgrade_file(self, new_file, old_file):
    """Repoint all URLs from old_file to new_file, then delete old_file from disk."""
    # print('Upgrading old file:', old_file.id, old_file.path, ' -> ', new_file.id, new_file.path)
    self._session.query(URL) \
        .filter(URL.file_id == old_file.id) \
        .update({URL.file_id: new_file.id})
    file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=old_file.path)
    if file.is_file():
        file.delete_file()
def _dedupe(self):
    """Hash every downloaded, un-hashed, non-album File and merge duplicates.

    Returns the number of Files processed this pass.
    """
    # unfinished = self._session \
    #     .query(File) \
    #     .options(joinedload(File.urls)) \
    #     .filter(File.hash == None) \
    #     .filter(File.downloaded == True) \
    #     .all()
    start_time = datetime.now()
    # IDs of Files that already have a full hash on record.
    hashed = set(int(r.file_id) for r in self._session.query(Hash.file_id)
                 .filter(Hash.full_hash != None, Hash.file_id != None))
    downloaded = set(r.id for r in self._session.query(File).filter(File.downloaded == True))
    # Get downloaded files without a hash, skipping anything already flagged to ignore.
    search_ids = downloaded.difference(hashed).difference(self.dedup_ignore_ids)
    unfinished = self._session.query(File).filter(File.id.in_(search_ids)).all()
    # Filter out albums.
    unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))
    # print("Working on %s files total" % len(unfinished), debug=True)
    if not unfinished:
        return 0
    stats = {'unique': 0, 'has_dup': 0, 'special_hash': 0, 'not_is_file': 0, 'is_album': 0}
    matches = []
    last_printed = ''
    for idx, f in enumerate(unfinished):
        self.progress.set_status("Deduplicating %s of %s files..." % (idx + 1, len(unfinished)))
        # print("Working on %s/%s files" % (idx, len(unfinished)), debug=True)
        path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
        is_album = any(u.album_id for u in f.urls)
        if not path.is_file():
            stats['not_is_file'] += 1
            self.dedup_ignore_ids.add(f.id)
            continue
        if is_album:
            stats['is_album'] += 1
            self.dedup_ignore_ids.add(f.id)
            continue
        if self._stop_event.is_set():
            break
        new_hash = FileHasher.get_best_hash(path.absolute())
        # print('New hash for File:', f.id, '::', new_hash)
        for h in self.special_hashes:
            if new_hash == h.full_hash:
                # Special hashes map straight to a known canonical file: repoint and delete.
                print("Found special hash:", h, "::\n", f, debug=True)
                stats['special_hash'] += 1
                with self._lock:
                    f.hash = Hash.make_hash(f, new_hash)
                    self._session.query(URL).filter(URL.file_id == f.id) \
                        .update({URL.file_id: h.file_id})
                    file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
                    if file.is_file():
                        file.delete_file()
                    self._session.commit()
                break
        else:  # Not a special hash.
            matches = self._find_matching_files(new_hash, ignore_id=f.id)
            if matches:
                if new_hash == last_printed:
                    print("Found another duplicate:", new_hash, "::\n", f, debug=True)
                elif len(matches) > 6:
                    printed = matches[:3] + ["... %s total matches ..." % len(matches)] + matches[-3:]
                    print("Found duplicate files: ", new_hash, "::\n",
                          '\n'.join(str(m) for m in [f] + printed), debug=True)
                else:
                    print("Found duplicate files: ", new_hash, "::\n",
                          '\n'.join(str(m) for m in [f] + matches), debug=True)
                stats['has_dup'] += 1
                last_printed = new_hash
            else:
                stats['unique'] += 1
            # print('\tActual matches:', matches)
            with self._lock:
                f.hash = Hash.make_hash(f, new_hash)
                # print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
                if len(matches):
                    # print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
                    best, others = self._choose_best_file(matches + [f])
                    # print('Chose best File:', best.id)
                    for o in others:
                        self._upgrade_file(new_file=best, old_file=o)
                self._session.commit()
            if matches:
                print("Completed %s of %s files..." % (idx + 1, len(unfinished)), debug=True)
    dt = datetime.now() - start_time
    print("Completed all %s files in %s sec. Counts = %s" % (
        len(unfinished), str(dt),
        ', '.join('%s: %s' % (k, v) for k, v in stats.items() if v)), debug=True)
    # self.prune_counter += len(matches)
    # if self.prune_counter >= 100:
    #     self.prune_counter = 0
    #     self.progress.set_status("Pruning orphaned files...")
    #     self._prune()
    #     print("Finished pruning.", debug=True)
    return len(unfinished)
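
# --- Illustrative sketch (hypothetical; not part of the class above) --------
# The merge step in _dedupe() boils down to: group files that share a hash,
# keep one "best" copy per group, and repoint everything else at it. The toy
# record type, the in-memory store, and the "largest file wins" rule below are
# assumptions made purely for illustration; the real _dedupe() and
# _choose_best_file() work against the SQLAlchemy models (File, URL, Hash)
# and on-disk SanitizedRelFile paths.
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class _ToyFile:
    id: int
    path: str
    size: int
    hash: str
    superseded_by: Optional[int] = None  # Set when a better duplicate replaces this file.


def _dedupe_sketch(files: List[_ToyFile]) -> int:
    """Group files by hash, keep the largest in each group, and repoint the rest.

    Returns the number of files that were superseded.
    """
    groups: Dict[str, List[_ToyFile]] = defaultdict(list)
    for f in files:
        groups[f.hash].append(f)
    superseded = 0
    for dupes in groups.values():
        if len(dupes) < 2:
            continue  # Unique hash; nothing to merge.
        best = max(dupes, key=lambda d: d.size)  # Assumed ranking rule.
        for other in dupes:
            if other is not best:
                other.superseded_by = best.id  # Analogous to _upgrade_file() repointing URLs.
                superseded += 1
    return superseded


# Example: records 1 and 2 share a hash, so the smaller one is superseded.
# _dedupe_sketch([_ToyFile(1, 'a.jpg', 100, 'h1'),
#                 _ToyFile(2, 'b.jpg', 250, 'h1'),
#                 _ToyFile(3, 'c.jpg', 90, 'h2')])  # -> 1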