Ejemplo n.º 1
0
class Deduplicator(multiprocessing.Process):
    """Background process that hashes downloaded Files and merges duplicates.

    The process polls the database for downloaded Files that have no hash yet,
    hashes them, and whenever another File has a close-enough hash, re-points
    the URLs of the smaller Files at the largest one and deletes the smaller
    Files from disk.
    """

    # Two hashes are considered the same file when their Hamming distance is
    # strictly below this value.
    _MATCH_DISTANCE_LIMIT = 4

    def __init__(self, settings_json, stop_event, db_lock):
        """
        Create a Hasher Process, which will be bound to the stop_event,
        performing post-processing on downloaded Files.

        :param settings_json: Serialized settings, handed to `settings.from_json` in `run`.
        :param stop_event: Event-like object (`is_set()` / `wait()`); once set, the
            polling loop ends and one last pass is run.
        :param db_lock: Context manager held around every database write.
        """
        super().__init__()
        self._settings = settings_json
        self._stop_event = stop_event
        self._lock = db_lock
        self.progress = DownloaderProgress()
        self.progress.clear(status="Starting up...")
        self._session = None
        self.daemon = True

    def run(self):
        """ Process entry point: poll for un-hashed Files until the stop event is set.

        Any exception is reported through `self.progress` and printed instead of
        being propagated; the SQL layer is always closed on the way out.
        """
        settings.from_json(self._settings)
        sql.init_from_settings()
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...")
            self.progress.set_running(True)

            while not self._stop_event.is_set():
                self._dedupe()
                self.progress.set_status("Ready for new files...")
                # Doubles as the polling delay; returns early once the event is set.
                self._stop_event.wait(2)
            self._dedupe()  # Run one final pass after downloading stops.
            self.progress.clear(status="Finished.", running=False)
        except Exception as ex:
            print('Deduplication Process Error:', ex)
            self.progress.set_error(ex)
            self.progress.set_running(False)
            traceback.print_exc()
        finally:
            sql.close()

    def _dedupe(self):
        """ Hash every downloaded, un-hashed, non-album File and merge any duplicates found. """
        candidates = self._session \
            .query(File) \
            .options(joinedload(File.urls)) \
            .filter(File.hash == None) \
            .filter(File.downloaded == True) \
            .all()  # noqa: E711,E712 - SQLAlchemy needs the ==/!= operators here.

        # Filter out albums.
        unfinished = [
            f for f in candidates if not any(u.album_id for u in f.urls)
        ]
        if not unfinished:
            return

        total = len(unfinished)
        base_dir = settings.get("output.base_dir")
        for idx, f in enumerate(unfinished):
            self.progress.set_status("Deduplicating (%s) files..." %
                                     (total - idx))
            path = SanitizedRelFile(base=base_dir, file_path=f.path)
            # The album test repeats the pre-filter above on purpose.
            # NOTE(review): presumably this guards against URLs changing once an
            # earlier commit in this loop refreshed the objects - confirm.
            is_album = any(u.album_id for u in f.urls)
            if not path.is_file() or is_album:
                continue
            # Hashing and the match lookup happen outside the lock; only the
            # database writes below are serialized.
            new_hash = FileHasher.get_best_hash(path.absolute())
            matches = self._find_matching_files(new_hash, ignore_id=f.id)
            with self._lock:
                f.hash = Hash.make_hash(f, new_hash)
                if matches:
                    best, others = self._choose_best_file(matches + [f])
                    for o in others:
                        self._upgrade_file(new_file=best, old_file=o)
                self._session.commit()
        self._prune()

    def _find_matching_files(self, search_hash, ignore_id):
        """ Return the hashed Files that match `search_hash`.

        :param search_hash: The full hash to look for.
        :param ignore_id: ID of the File being processed; it is never returned,
            so a File cannot be reported as a duplicate of itself.
        :return: List of matching Files (see `_check_hash_match`).
        """
        sp = Hash.split_hash(search_hash)
        # Cheap SQL pre-filter: any File sharing the full hash or one of its four parts.
        potential = self._session \
            .query(File) \
            .join(Hash, File.hash) \
            .filter(File.id != ignore_id) \
            .filter(
                (Hash.full_hash == search_hash) |
                (Hash.p1 == sp[0]) |
                (Hash.p2 == sp[1]) |
                (Hash.p3 == sp[2]) |
                (Hash.p4 == sp[3])
            ).all()
        return [
            f for f in potential if self._check_hash_match(f, search_hash)
        ]

    def _check_hash_match(self, file, search_hash):
        """ Compare the given hash against the given SQL File.

        Returns False if the target File has no hash, has any URL that belongs to
        an album or is not fully processed, or if its hash is too far from
        `search_hash` (Hamming distance of `_MATCH_DISTANCE_LIMIT` or more).
        """
        if not file.hash or any(u.album_id or not u.processed
                                for u in file.urls):
            return False
        distance = FileHasher.hamming_distance(search_hash,
                                               file.hash.full_hash)
        return distance < self._MATCH_DISTANCE_LIMIT

    def _choose_best_file(self, files):
        """ Split `files` into (largest file on disk, all the others), largest first. """
        base_dir = settings.get("output.base_dir")
        ranked = sorted(
            files,
            key=lambda f: SanitizedRelFile(base=base_dir,
                                           file_path=f.path).size(),
            reverse=True)
        return ranked[0], ranked[1:]

    def _upgrade_file(self, new_file, old_file):
        """ Re-point every URL of `old_file` at `new_file`, then delete `old_file` from disk. """
        self._session.query(URL). \
            filter(URL.file_id == old_file.id). \
            update({URL.file_id: new_file.id})
        file = SanitizedRelFile(base=settings.get("output.base_dir"),
                                file_path=old_file.path)
        if file.is_file():
            file.delete_file()

    def _prune(self):
        """ Delete every File row that no longer has any URL pointing at it. """
        with self._lock:
            self._session.query(File).filter(
                ~File.urls.any()).delete(synchronize_session='fetch')
            self._session.commit()
Ejemplo n.º 2
0
class Downloader(multiprocessing.Process):
    """Worker process that handles the URLs whose ids arrive on a reader."""

    def __init__(self, reader, ack_queue, settings_json, db_lock):
        """
        Create a Downloader Process, which will be bound to the queue given,
        listening for URLs to download.

        :param reader: Iterable yielding the ids of URLs to handle.
        :param ack_queue: Queue that receives one AckPacket per URL id read.
        :param settings_json: Serialized settings, handed to `settings.from_json` in `run`.
        :param db_lock: Context manager held around every database write.
        """
        super().__init__()
        self._reader = reader
        self._settings = settings_json
        self.progress = DownloaderProgress()
        self._session = None
        self._db_lock = db_lock
        self._ack_queue = ack_queue
        self.daemon = True

    def run(self):
        """ Process entry point: handle URL ids from the reader until it is exhausted.

        The first failing URL is still ACKed (with no extra URLs), the error is
        reported through `self.progress`, and the loop stops. The SQL layer is
        closed no matter how the loop ends.
        """
        settings.from_json(self._settings)
        sql.init_from_settings()
        # None means "no error"; a plain truthiness flag would misreport an
        # exception whose message is the empty string as a success.
        failure = None
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...", running=True)

            for nxt_id in self._reader:
                try:
                    extra_urls = self._handle_url(nxt_id)
                    # Once *all* processing is completed on this URL, the Downloader needs to ACK it.
                    # If any additional Album URLS were located, they are sent with the ACK.
                    self._ack_queue.put(
                        AckPacket(url_id=nxt_id, extra_urls=extra_urls))
                    self.progress.clear(status="Waiting for URL...")
                except Exception as ex:
                    failure = str(ex)
                    self._ack_queue.put(
                        AckPacket(url_id=nxt_id, extra_urls=[]))
                    print(ex)
                    traceback.print_exc()
                    self.progress.set_error("Exited with error: {%s}" %
                                            failure)
                    break
        finally:
            sql.close()

        self.progress.clear(
            "Finished." if failure is None else "Exited with error: %s" %
            failure,
            running=False)

    def _handle_url(self, url_id):
        """ Run the handlers for one URL and record the outcome in the database.

        :param url_id: ID of the URL row to process.
        :return: List of album URLs discovered by the handler (empty when none
            were found, or when this URL is itself already part of an album).
        :raises Exception: If no URL with the given ID exists.
        """
        url = self._session.query(
            sql.URL).filter(sql.URL.id == url_id).first()
        if not url:
            raise Exception("Unknown URL ID provided: (%s)" % url_id)

        file = url.file
        path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                file_path=str(file.path))

        self.progress.set_file(path.relative())
        self.progress.set_status("Attempting to Handle URL...")
        self.progress.set_running(True)

        # The handler runs outside the lock; only the DB update is serialized.
        task = handlers.HandlerTask(url=url.address, file_obj=path)
        resp = handlers.handle(task, self.progress)

        album_urls = resp.album_urls or []
        is_album_parent = False

        with self._db_lock:
            if album_urls:
                if url.album_id:
                    # Ignore nested Albums to avoid recursion.
                    album_urls = []
                else:
                    url.album_id = str(uuid.uuid4())
                    is_album_parent = True

            url.failed = not resp.success
            url.failure_reason = resp.failure_reason
            url.last_handler = resp.handler
            url.album_is_parent = is_album_parent

            if resp.rel_file:
                file.downloaded = True
                file.path = resp.rel_file.relative()
                # Clearing the hash marks the File as not yet hashed.
                file.hash = None
                # Stamp access and modification time with the same instant.
                now = time()
                utime(resp.rel_file.absolute(), times=(now, now))

            self._session.commit()

        return album_urls
class Deduplicator(multiprocessing.Process):
    """Background process that hashes downloaded Files and merges duplicates.

    This variant stores the hash directly on the File and performs its database
    writes without a lock.
    """

    def __init__(self, settings_json, stop_event):
        """
        Create a Hasher Process, which will be bound to the stop_event,
        performing post-processing on downloaded Files.

        :param settings_json: Serialized settings, handed to `settings.from_json` in `run`.
        :param stop_event: Event-like object (`is_set()` / `wait()`); once set, the
            polling loop ends and one last pass is run.
        """
        super().__init__()
        self._settings = settings_json
        self._stop_event = stop_event
        self.progress = DownloaderProgress()
        self.progress.clear(status="Starting up...")
        self._session = None
        self.daemon = True

    def run(self):
        """ Process entry point: poll for un-hashed Files until the stop event is set.

        Exceptions still propagate to the caller, but the progress object is
        always marked as not running and the SQL layer is always closed first.
        """
        settings.from_json(self._settings)
        sql.init_from_settings()
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...")
            self.progress.set_running(True)

            while not self._stop_event.is_set():
                self._dedupe()
                self.progress.set_status("Waiting for new files...")
                # Doubles as the polling delay; returns early once the event is set.
                self._stop_event.wait(2)
            self._dedupe()  # Run one final pass after downloading stops.
        finally:
            self.progress.set_running(False)
            sql.close()
        self.progress.clear("Finished.")

    def _dedupe(self):
        """ Hash every downloaded, un-hashed, non-album File and merge any duplicates found. """
        candidates = self._session \
            .query(File) \
            .options(joinedload(File.urls)) \
            .filter(File.hash == None) \
            .filter(File.downloaded == True) \
            .all()  # noqa: E711,E712 - SQLAlchemy needs the ==/!= operators here.

        # Filter out albums.
        unfinished = [
            f for f in candidates if not any(u.album_id for u in f.urls)
        ]
        if not unfinished:
            return

        total = len(unfinished)
        for idx, f in enumerate(unfinished):
            self.progress.set_status("Deduplicating (%s) files..." %
                                     (total - idx))
            path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                    file_path=f.path)
            is_album = any(u.album_id for u in f.urls)
            if not path.is_file() or is_album:
                continue
            new_hash = FileHasher.get_best_hash(path.absolute())
            # Albums never reach this point, so the lookup is unconditional.
            matches = self._find_matching_files(new_hash, ignore_id=f.id)
            f.hash = new_hash
            if matches:
                print("Found duplicate files: ", new_hash, "::",
                      [(m.id, m.path) for m in matches])
                best, others = self._choose_best_file(matches + [f])
                print('Chose best File:', best.id)
                for o in others:
                    self._upgrade_file(new_file=best, old_file=o)
            self._session.commit()
        self._prune()

    def _find_matching_files(self, search_hash, ignore_id):
        """ Return the hashed, downloaded Files whose hash is close to `search_hash`.

        :param search_hash: The hash to compare against.
        :param ignore_id: ID of the File being processed; it is excluded from the search.
        :return: List of Files whose hash has a Hamming distance below 4 and whose
            URLs are all processed and not part of an album.
        """
        hashed_files = self._session \
            .query(File) \
            .options(joinedload(File.urls)) \
            .filter(File.hash != None) \
            .filter(File.downloaded == True) \
            .filter(File.id != ignore_id) \
            .all()  # noqa: E711,E712 - SQLAlchemy needs the ==/!= operators here.
        matches = []
        for pm in hashed_files:
            # Skip Files that belong to albums or still have unprocessed URLs.
            if any(u.album_id or not u.processed for u in pm.urls):
                continue
            if FileHasher.hamming_distance(search_hash, pm.hash) < 4:
                matches.append(pm)
        return matches

    def _choose_best_file(self, files):
        """ Split `files` into (largest file on disk, all the others), largest first. """
        base_dir = settings.get("output.base_dir")
        ranked = sorted(
            files,
            key=lambda f: SanitizedRelFile(base=base_dir,
                                           file_path=f.path).size(),
            reverse=True)
        return ranked[0], ranked[1:]

    def _upgrade_file(self, new_file, old_file):
        """ Re-point every URL of `old_file` at `new_file`, then delete `old_file` from disk. """
        print('Upgrading old file:', old_file.id, old_file.path, ' -> ',
              new_file.id, new_file.path)
        self._session.query(URL). \
            filter(URL.file_id == old_file.id). \
            update({URL.file_id: new_file.id})
        file = SanitizedRelFile(base=settings.get("output.base_dir"),
                                file_path=old_file.path)
        if file.is_file():
            file.delete_file()

    def _prune(self):
        """ Delete every File row that no longer has any URL pointing at it. """
        orphans = self._session.query(File).filter(~File.urls.any()).delete(
            synchronize_session='fetch')
        self._session.commit()
        if orphans:
            print("Deleted orphan Files:", orphans)
Ejemplo n.º 4
0
class Deduplicator(multiprocessing.Process):
    """Background process that hashes downloaded Files and merges exact duplicates.

    This variant keeps per-pass statistics, remembers Files it cannot process
    so they are not retried, and treats Hash rows with a negative id as
    "special" hashes whose matches are redirected and deleted.

    NOTE: `print` is called with a `debug=` keyword throughout, so this class
    relies on a project-provided `print` replacement being in scope.
    """

    def __init__(self, settings_json, stop_event, db_lock):
        """
        Create a Hasher Process, which will be bound to the stop_event,
        performing post-processing on downloaded Files.

        :param settings_json: Serialized settings, handed to `settings.from_json` in `run`.
        :param stop_event: Event-like object (`is_set()` / `wait()`); once set, the
            polling loop ends.
        :param db_lock: Context manager held around every database write.
        """
        super().__init__()
        self._settings = settings_json
        self._stop_event = stop_event
        self._lock = db_lock
        self.progress = DownloaderProgress()
        self.progress.clear(status="Starting up...")
        self._session = None
        self.daemon = True
        # Working state; reset in run(). Defined here as well so the helper
        # methods never hit an AttributeError when used before run().
        self.dedup_ignore_ids = set()
        self.prune_counter = 0
        self.special_hashes = []

    def run(self):
        """ Process entry point: poll for un-hashed Files until the stop event is set.

        Waits 1 second after a pass that handled files and 10 seconds after an
        empty one. Any exception is reported through `self.progress` and
        printed instead of being propagated; the SQL layer is always closed.
        """
        settings.from_json(self._settings)
        sql.init_from_settings()
        print("Starting up...", debug=True)
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...")
            self.progress.set_running(True)
            self.dedup_ignore_ids = set()
            self.prune_counter = 0
            # Hash rows with a negative id are the "special" hashes.
            self.special_hashes = self._session.query(Hash).filter(
                Hash.id < 0).all()

            while not self._stop_event.is_set():
                completed = self._dedupe()
                if completed:
                    self.progress.set_status(
                        "Completed %s files. Ready for new files..." %
                        completed)
                    self._stop_event.wait(1)
                else:
                    self._stop_event.wait(10)
            print("_stop_event is %s" % self._stop_event.is_set(), debug=True)
            # Run one final pass after downloading stops.
            # NOTE(review): _dedupe() breaks out as soon as it sees the stop
            # event set, so this pass only records missing/album files and
            # hashes nothing - confirm whether that is intended.
            self._dedupe()
            self.progress.clear(status="Finished.", running=False)
        except Exception as ex:
            print('Deduplication Process Error:', ex)
            self.progress.set_error(ex)
            self.progress.set_running(False)
            traceback.print_exc()
        finally:
            print("Finished process, _stop_event is %s" %
                  self._stop_event.is_set(),
                  debug=True)
            sql.close()

    def _dedupe(self):
        """ Hash every downloaded, un-hashed, non-album File and merge duplicates.

        Files that are missing on disk or belong to an album are added to
        `self.dedup_ignore_ids` so later passes skip them. The pass stops early
        when the stop event is set.

        :return: Number of Files handled in this pass (0 when there was nothing to do).
        """
        start_time = datetime.now()

        # noqa: E711,E712 below - SQLAlchemy needs the ==/!= operators.
        hashed = {
            int(row.file_id)
            for row in self._session.query(Hash.file_id).filter(
                Hash.full_hash != None, Hash.file_id != None)  # noqa: E711
        }
        # Only the id column is needed here, so full File rows are not loaded.
        downloaded = {
            row.id
            for row in self._session.query(File.id).filter(
                File.downloaded == True)  # noqa: E712
        }
        # Downloaded files without a hash, minus the ones known to be unprocessable.
        search_ids = downloaded.difference(hashed).difference(
            self.dedup_ignore_ids)
        if not search_ids:
            return 0  # Nothing to look up; avoids issuing an empty IN () query.

        candidates = self._session.query(File).filter(
            File.id.in_(search_ids)).all()
        # Filter out albums.
        unfinished = [
            f for f in candidates if not any(u.album_id for u in f.urls)
        ]
        if not unfinished:
            return 0

        stats = {
            'unique': 0,
            'has_dup': 0,
            'special_hash': 0,
            'not_is_file': 0,
            'is_album': 0
        }
        total = len(unfinished)
        handled = total  # Lowered below if the pass is interrupted.
        last_printed = ''
        for idx, f in enumerate(unfinished):
            self.progress.set_status("Deduplicating %s of %s files..." %
                                     (idx + 1, total))
            path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                    file_path=f.path)
            is_album = any(u.album_id for u in f.urls)
            if not path.is_file():
                stats['not_is_file'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if is_album:
                stats['is_album'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if self._stop_event.is_set():
                handled = idx  # Files before this one were fully handled.
                break

            new_hash = FileHasher.get_best_hash(path.absolute())
            special = next(
                (h for h in self.special_hashes if new_hash == h.full_hash),
                None)
            if special is not None:
                # Redirect this File's URLs to the special hash's File and
                # remove the downloaded copy from disk.
                print("Found special hash:", special, "::\n", f, debug=True)
                stats['special_hash'] += 1
                with self._lock:
                    f.hash = Hash.make_hash(f, new_hash)
                    self._session.query(URL).filter(
                        URL.file_id == f.id).update(
                            {URL.file_id: special.file_id})
                    file = SanitizedRelFile(
                        base=settings.get("output.base_dir"),
                        file_path=f.path)
                    if file.is_file():
                        file.delete_file()
                    self._session.commit()
                continue

            # Not a special hash: look for existing Files with the same hash.
            matches = self._find_matching_files(new_hash, ignore_id=f.id)
            if matches:
                if new_hash == last_printed:
                    print("Found another duplicate:",
                          new_hash,
                          "::\n",
                          f,
                          debug=True)
                else:
                    shown = matches
                    if len(matches) > 6:
                        # Abbreviate long match lists to the first and last three.
                        shown = matches[:3] + [
                            "... %s total matches ..." % len(matches)
                        ] + matches[-3:]
                    print("Found duplicate files: ",
                          new_hash,
                          "::\n",
                          '\n'.join(str(m) for m in [f] + shown),
                          debug=True)
                stats['has_dup'] += 1
                last_printed = new_hash
            else:
                stats['unique'] += 1

            with self._lock:
                f.hash = Hash.make_hash(f, new_hash)
                if matches:
                    best, others = self._choose_best_file(matches + [f])
                    for o in others:
                        self._upgrade_file(new_file=best, old_file=o)
                self._session.commit()
            if matches:
                print("Completed %s of %s files..." % (idx + 1, total),
                      debug=True)

        dt = datetime.now() - start_time
        counts = ', '.join('%s: %s' % (k, v) for k, v in stats.items() if v)
        if handled == total:
            print("Completed all %s files in %s sec. Counts = %s" %
                  (total, str(dt), counts),
                  debug=True)
        else:
            print("Stopped early after %s of %s files in %s sec. Counts = %s" %
                  (handled, total, str(dt), counts),
                  debug=True)
        # Pruning of orphaned Files is intentionally not run here; see _prune().
        return handled

    def _find_matching_files(self, search_hash, ignore_id):
        """ Return the hashed Files that match `search_hash`.

        :param search_hash: The full hash to look for.
        :param ignore_id: ID of the File being processed; it is never returned,
            so a File cannot be reported as a duplicate of itself.
        :return: List of matching Files (see `_check_hash_match`).
        """
        sp = Hash.split_hash(search_hash)
        # SQL pre-filter: any File sharing the full hash or one of its four parts.
        potential = self._session \
            .query(File) \
            .join(Hash, File.hash) \
            .filter(File.id != ignore_id) \
            .filter(
                (Hash.full_hash == search_hash) |
                (Hash.p1 == sp[0]) |
                (Hash.p2 == sp[1]) |
                (Hash.p3 == sp[2]) |
                (Hash.p4 == sp[3])
            ).all()
        return [
            f for f in potential if self._check_hash_match(f, search_hash)
        ]

    def _check_hash_match(self, file, search_hash):
        """ Compare the given hash against the given SQL File.

        Returns False if the target File has no hash, has any URL that belongs to
        an album or is not fully processed, or if its full hash differs from
        `search_hash`. Only exact matches count in this variant.
        """
        if not file.hash or any(u.album_id or not u.processed
                                for u in file.urls):
            return False
        return search_hash == file.hash.full_hash

    def _choose_best_file(self, files):
        """ Split `files` into (largest file on disk, all the others), largest first. """
        base_dir = settings.get("output.base_dir")
        ranked = sorted(
            files,
            key=lambda f: SanitizedRelFile(base=base_dir,
                                           file_path=f.path).size(),
            reverse=True)
        return ranked[0], ranked[1:]

    def _upgrade_file(self, new_file, old_file):
        """ Re-point every URL of `old_file` at `new_file`, then delete `old_file` from disk. """
        self._session.query(URL). \
            filter(URL.file_id == old_file.id). \
            update({URL.file_id: new_file.id})
        file = SanitizedRelFile(base=settings.get("output.base_dir"),
                                file_path=old_file.path)
        if file.is_file():
            file.delete_file()

    def _prune(self):
        """ Delete every File row that no URL refers to.

        The orphan set is computed in Python from the two id columns rather
        than with a correlated `~File.urls.any()` sub-query.
        """
        with self._lock:
            file_ids = {row.id for row in self._session.query(File.id)}
            # URLs without a file are skipped; int(None) would raise otherwise.
            referenced_ids = {
                int(row.file_id)
                for row in self._session.query(URL.file_id).filter(
                    URL.file_id != None)  # noqa: E711
            }
            orphan_ids = file_ids.difference(referenced_ids)
            orphans = 0
            if orphan_ids:
                orphans = self._session.query(File).filter(
                    File.id.in_(orphan_ids)).delete(
                        synchronize_session='fetch')
            self._session.commit()
            if orphans:
                print("Deleted orphan Files:", orphans, debug=True)