Example no. 1
0
 def _upgrade_file(self, new_file, old_file):
     """Re-point all URL rows from *old_file* to *new_file*, then delete the old file on disk."""
     query = self._session.query(URL).filter(URL.file_id == old_file.id)
     query.update({URL.file_id: new_file.id})
     stale = SanitizedRelFile(base=settings.get("output.base_dir"),
                              file_path=old_file.path)
     if stale.is_file():
         stale.delete_file()
Example no. 2
0
    def _dedupe(self) -> int:
        """Hash downloaded files and merge duplicates.

        Finds every downloaded, not-yet-hashed, non-album File (minus IDs in
        ``self.dedup_ignore_ids``), computes its hash, and:

        * if the hash matches a "special hash", re-points the file's URLs at
          that hash's canonical file and deletes this copy from disk;
        * otherwise, if other files share the hash, picks the best copy via
          ``self._choose_best_file`` and upgrades the rest onto it;
        * otherwise records the file as unique.

        Returns the number of files examined this pass (0 if nothing to do).
        """
        # Legacy query kept for reference; superseded by the set-difference
        # approach below.
        # unfinished = self._session\
        # 	.query(File) \
        # 	.options(joinedload(File.urls))\
        # 	.filter(File.hash == None)\
        # 	.filter(File.downloaded == True)\
        # 	.all()
        start_time = datetime.now()

        # File IDs that already have a full hash recorded.
        hashed = set(int(r.file_id) for r in self._session.query(Hash.file_id) \
                     .filter(Hash.full_hash != None, Hash.file_id != None))
        downloaded = set(
            r.id
            for r in self._session.query(File).filter(File.downloaded == True))
        # get downloaded files without a hash, excluding IDs skipped earlier
        search_ids = downloaded.difference(hashed).difference(
            self.dedup_ignore_ids)
        unfinished = self._session.query(File).filter(
            File.id.in_(search_ids)).all()

        unfinished = list(
            filter(lambda _f: not any(u.album_id for u in _f.urls),
                   unfinished))  # Filter out albums.

        #print("Working on %s files total"%len(unfinished), debug=True)

        if not unfinished:
            return 0

        # Per-outcome counters for the summary printed at the end.
        stats = {
            'unique': 0,
            'has_dup': 0,
            'special_hash': 0,
            'not_is_file': 0,
            'is_album': 0
        }
        matches = []
        # Last duplicate hash that was logged in full — used to condense
        # repeated log output for runs of the same hash.
        last_printed = ''
        for idx, f in enumerate(unfinished):
            self.progress.set_status("Deduplicating %s of %s files..." %
                                     (idx + 1, len(unfinished)))
            #print("Working on  %s/%s files"%(idx, len(unfinished)), debug=True)
            path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                    file_path=f.path)
            # NOTE(review): albums were already filtered out of `unfinished`
            # above, so this re-check looks redundant — confirm whether
            # f.urls can gain an album_id between the filter and here.
            is_album = any(u.album_id for u in f.urls)
            if not path.is_file():
                # Missing on disk: skip permanently for this session.
                stats['not_is_file'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if is_album:
                stats['is_album'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if self._stop_event.is_set():
                break
            new_hash = FileHasher.get_best_hash(path.absolute())
            # print('New hash for File:', f.id, '::', new_hash)
            for h in self.special_hashes:
                if new_hash == h.full_hash:
                    print("Found special hash:", h, "::\n", f, debug=True)
                    stats['special_hash'] += 1
                    with self._lock:
                        # Re-point this file's URLs at the special hash's
                        # canonical file, then delete the redundant copy.
                        f.hash = Hash.make_hash(f, new_hash)
                        self._session.query(URL).filter(
                            URL.file_id == f.id).update(
                                {URL.file_id: h.file_id})
                        file = SanitizedRelFile(
                            base=settings.get("output.base_dir"),
                            file_path=f.path)
                        if file.is_file():
                            file.delete_file()
                        self._session.commit()
                        # break exits the special-hash loop AND skips the
                        # for-else branch below.
                        break
            else:  # not a special hash
                matches = self._find_matching_files(new_hash, ignore_id=f.id)
                if matches:
                    # Log duplicates, truncating the listing when a single
                    # hash has many matches or repeats from the last entry.
                    if new_hash == last_printed:
                        print("Found another duplicate:",
                              new_hash,
                              "::\n",
                              f,
                              debug=True)
                    elif len(matches) > 6:
                        printed = matches[:3] + [
                            "... %s total matches ..." % len(matches)
                        ] + matches[-3:]
                        print("Found duplicate files: ",
                              new_hash,
                              "::\n",
                              '\n'.join(str(m) for m in [f] + printed),
                              debug=True)
                    else:
                        print("Found duplicate files: ",
                              new_hash,
                              "::\n",
                              '\n'.join(str(m) for m in [f] + matches),
                              debug=True)
                    stats['has_dup'] += 1
                    last_printed = new_hash
                else:
                    stats['unique'] += 1
                # print('\tActual matches:', matches)
                with self._lock:
                    f.hash = Hash.make_hash(f, new_hash)
                    #print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
                    if len(matches):
                        #print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
                        # Keep the best copy; fold every other duplicate
                        # (URLs + on-disk file) into it.
                        best, others = self._choose_best_file(matches + [f])
                        # print('Chose best File:', best.id)
                        for o in others:
                            self._upgrade_file(new_file=best, old_file=o)
                    self._session.commit()
                if matches:
                    print("Completed %s of %s files..." %
                          (idx + 1, len(unfinished)),
                          debug=True)
        dt = datetime.now() - start_time
        # Summary: only non-zero counters are reported.
        print("Completed all %s files in %s sec. Counts = %s" %
              (len(unfinished), str(dt), ', '.join(
                  '%s: %s' % (k, v) for k, v in stats.items() if v)),
              debug=True)
        # self.prune_counter += len(matches)
        # if self.prune_counter >= 100:
        # 	self.prune_counter = 0
        #self.progress.set_status("Pruning orphaned files...")
        #self._prune()
        #print("Finished pruning.", debug=True)
        return len(unfinished)