コード例 #1
0
 def _choose_best_file(self, files):
     """
     Rank duplicate candidates by their size on disk and split off the winner.

     Each candidate's ``path`` is resolved against the configured
     "output.base_dir" and measured through ``SanitizedRelFile.size()``.

     :param files: Non-empty list of candidates exposing a ``path`` attribute.
     :return: ``(best, others)`` - the candidate with the greatest size, and a
         list of the remaining candidates in descending size order.
     :raises IndexError: If ``files`` is empty.
     """

     def _size_on_disk(candidate):
         # Resolve the stored relative path before asking for its size.
         located = SanitizedRelFile(
             base=settings.get("output.base_dir"), file_path=candidate.path)
         return located.size()

     ranked = sorted(files, key=_size_on_disk, reverse=True)
     return ranked[0], ranked[1:]
コード例 #2
0
def api_search_posts(fields, term, page_size, page):
    """
    Search stored Posts and return one page of results that still have files on disk.

    :param fields: Field selection forwarded to ``PostSearcher.search_fields``.
    :param term: Search term; leading/trailing ``%`` characters are stripped before searching.
    :param page_size: Number of matched posts per page.
    :param page: Zero-based page index.
    :return: Dict with ``total`` (count of all matches, before paging and before
        the on-disk file check) and ``results`` (serialized posts from the
        requested page that have at least one existing file).
    """
    # Attributes copied verbatim from each Post into its serialized form.
    post_attrs = (
        'reddit_id', 'author', 'type', 'title', 'body', 'parent_id',
        'subreddit', 'over_18', 'created_utc', 'num_comments', 'score',
        'source_alias',
    )

    searcher = sql.PostSearcher(_session)
    matches = searcher.search_fields(fields, term.strip("%"))
    total = len(matches)
    start = page * page_size

    results = []
    for post in matches[start:start + page_size]:
        files = []
        for url in post.urls:
            if not url.file:
                print('Post URL Missing a File:', url)
                continue
            located = SanitizedRelFile(
                base=settings.get("output.base_dir"),
                file_path=url.file.path)
            if not located.is_file():
                continue  # Only report files that actually exist on disk.
            files.append({'token': url.file.id, 'path': located.absolute()})

        if not files:
            continue  # Posts without any existing file are left out of the page.

        entry = {attr: getattr(post, attr) for attr in post_attrs}
        entry['files'] = files
        results.append(entry)

    return {'total': total, 'results': results}
コード例 #3
0
def mock_handler_request(base_dir, target_url):
	"""
	Build the HandlerTask / DownloaderProgress / SanitizedRelFile trio used by Handler tests.

	:param base_dir: Base directory the generated file path is resolved against.
	:param target_url: URL the HandlerTask should point at.
	:return: ``(task, progress, file)``; the file gets a random UUID4 name so separate calls do not collide.
	"""
	import processing.handlers as handlers
	from processing.wrappers import SanitizedRelFile, DownloaderProgress
	target_file = SanitizedRelFile(base=base_dir, file_path=str(uuid.uuid4()))
	task = handlers.HandlerTask(url=target_url, file_obj=target_file)
	return task, DownloaderProgress(), target_file
コード例 #4
0
 def _upgrade_file(self, new_file, old_file):
     """
     Repoint every URL from ``old_file`` to ``new_file`` and remove the old copy from disk.

     The URL update is issued on ``self._session`` but is not committed here.

     :param new_file: File record the URLs should reference from now on.
     :param old_file: File record being replaced; its path is resolved against
         the configured "output.base_dir" and deleted if it exists.
     """
     stale_links = self._session.query(URL).filter(URL.file_id == old_file.id)
     stale_links.update({URL.file_id: new_file.id})

     stale_copy = SanitizedRelFile(
         base=settings.get("output.base_dir"), file_path=old_file.path)
     if not stale_copy.is_file():
         return  # Nothing on disk to clean up.
     stale_copy.delete_file()
コード例 #5
0
def init_from_settings():
    """
    Builds the database file using the Settings currently loaded.

    The manifest is placed under "output.manifest_for_sqlite_dir" rather than
    "output.base_dir", so manifest.sqlite can be stored in a different
    directory than the downloads.
    """
    manifest_dir = settings.get("output.manifest_for_sqlite_dir")
    manifest = SanitizedRelFile(base=manifest_dir, file_path="manifest.sqlite")
    manifest.mkdirs()
    init(manifest.absolute())
コード例 #6
0
    def _dedupe(self):
        """
        Hash every downloaded, not-yet-hashed, non-album File and merge duplicates.

        For each candidate that exists on disk, the best available hash is
        computed and stored. When other Files share that hash, the best of the
        group is kept and the rest are upgraded to it. Orphans are pruned once
        the whole batch has been processed; nothing is pruned if there was
        no work to do.
        """
        pending_query = (
            self._session.query(File)
            .options(joinedload(File.urls))
            .filter(File.hash == None)  # SQLAlchemy expression, not a Python comparison.
            .filter(File.downloaded == True)
        )
        # Files that belong to an album are never deduplicated.
        pending = [
            row for row in pending_query.all()
            if not any(u.album_id for u in row.urls)
        ]
        if not pending:
            return

        total = len(pending)
        for position, record in enumerate(pending):
            self.progress.set_status(
                "Deduplicating (%s) files..." % (total - position))
            on_disk = SanitizedRelFile(
                base=settings.get("output.base_dir"), file_path=record.path)
            in_album = any(u.album_id for u in record.urls)
            usable = on_disk.is_file() and not in_album
            if not usable:
                continue

            digest = FileHasher.get_best_hash(on_disk.absolute())
            duplicates = self._find_matching_files(digest, ignore_id=record.id)
            with self._lock:
                record.hash = Hash.make_hash(record, digest)
                if duplicates:
                    keeper, redundant = self._choose_best_file(
                        duplicates + [record])
                    for loser in redundant:
                        self._upgrade_file(new_file=keeper, old_file=loser)
                self._session.commit()

        self._prune()
コード例 #7
0
def init_from_settings():
    """
    Builds the database file using the Settings currently loaded.

    The manifest lives at "manifest.sqlite" inside the configured
    "output.base_dir"; missing directories are created before initialization.
    """
    output_dir = settings.get("output.base_dir")
    manifest = SanitizedRelFile(base=output_dir, file_path="manifest.sqlite")
    manifest.mkdirs()
    init(manifest.absolute())
コード例 #8
0
    def run(self):
        """
        Worker loop: handle each URL ID delivered by the reader, then ACK it.

        Loads the settings from ``self._settings``, opens a SQL session, and for
        every ID yielded by ``self._reader``:

        1. looks up the URL row and resolves its File path under "output.base_dir",
        2. hands the URL to ``handlers.handle``,
        3. records the outcome on the URL/File rows while holding ``self._db_lock``,
        4. puts an ``AckPacket`` (with any newly found album URLs) on ``self._ack_queue``.

        Any exception stops the loop: the failing ID is still ACKed (without
        extra URLs), the error is printed and reported through ``self.progress``,
        and the final status reflects the failure.
        """
        settings.from_json(self._settings)
        sql.init_from_settings()
        self._session = sql.session()
        self.progress.clear(status="Starting up...", running=True)
        # None means "no failure". A dedicated sentinel is required because the
        # message of an exception can legitimately be an empty (falsy) string.
        failure = None

        for nxt_id in self._reader:
            try:
                url = self._session.query(
                    sql.URL).filter(sql.URL.id == nxt_id).first()
                if not url:
                    # NOTE(review): the closing '}' in this message looks like a typo for ')'.
                    raise Exception("Unknown URL ID provided: (%s}" % nxt_id)

                file = url.file
                path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                        file_path=str(file.path))

                self.progress.set_file(path.relative())
                self.progress.set_status("Attempting to Handle URL...")
                self.progress.set_running(True)

                task = handlers.HandlerTask(url=url.address, file_obj=path)
                resp = handlers.handle(task, self.progress)

                is_album_parent = False

                with self._db_lock:
                    if resp.album_urls:
                        if url.album_id:
                            # Ignore nested Albums to avoid recursion.
                            resp.album_urls = []
                        else:
                            # First album found for this URL: it becomes the parent.
                            url.album_id = str(uuid.uuid4())
                            is_album_parent = True
                    else:
                        # Normalize any falsy value to a list for the AckPacket below.
                        resp.album_urls = []

                    url.failed = not resp.success
                    url.failure_reason = resp.failure_reason
                    url.last_handler = resp.handler
                    url.album_is_parent = is_album_parent

                    if resp.rel_file:
                        file.downloaded = True
                        file.path = resp.rel_file.relative()
                        file.hash = None  # Clear any stored hash for the new file.
                        # Stamp access and modification time with one shared timestamp.
                        now = time()
                        utime(resp.rel_file.absolute(), times=(now, now))

                    self._session.commit()

                # Once *all* processing is completed on this URL, the Downloader needs to ACK it.
                # If any additional Album URLS were located, they should be sent before the ACK.
                self._ack_queue.put(
                    AckPacket(url_id=nxt_id, extra_urls=resp.album_urls))
                self.progress.clear(status="Waiting for URL...")
            except Exception as ex:
                # str(ex) is empty for exceptions raised without a message; fall
                # back to repr() so the failure is always visible and non-empty.
                failure = str(ex) or repr(ex)
                self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=[]))
                print(ex)
                traceback.print_exc()
                self.progress.set_error("Exited with error: {%s}" % failure)
                break

        sql.close()
        self.progress.clear(
            "Finished." if failure is None else "Exited with error: %s" % failure,
            running=False)
コード例 #9
0
    def _dedupe(self):
        """
        Hash downloaded Files that have no stored hash yet and merge duplicates.

        Candidates are the downloaded File IDs, minus those that already have a
        Hash row with a non-null ``full_hash``, minus ``self.dedup_ignore_ids``.
        Files attached to any album URL are excluded.

        For each candidate that exists on disk:

        * if its hash equals one of ``self.special_hashes``, its URLs are
          repointed to that special hash's file and the local copy is deleted;
        * otherwise, if other Files share the hash, the best File of the group
          is kept and the others are upgraded to it.

        Every processed File is committed individually while ``self._lock`` is
        held. The loop stops early when ``self._stop_event`` is set.

        NOTE(review): ``print`` is called with ``debug=True`` throughout, which
        the builtin does not accept - presumably a project-level replacement is
        in scope; confirm.

        :return: 0 when there was nothing to process, otherwise the number of
            candidate Files selected for this pass (also returned when the loop
            was interrupted by the stop event).
        """
        # unfinished = self._session\
        # 	.query(File) \
        # 	.options(joinedload(File.urls))\
        # 	.filter(File.hash == None)\
        # 	.filter(File.downloaded == True)\
        # 	.all()
        start_time = datetime.now()

        # IDs of Files that already own a Hash row with a computed full hash.
        hashed = set(int(r.file_id) for r in self._session.query(Hash.file_id) \
                     .filter(Hash.full_hash != None, Hash.file_id != None))
        # IDs of every File marked as downloaded.
        downloaded = set(
            r.id
            for r in self._session.query(File).filter(File.downloaded == True))
        # get downloaded files without a hash
        search_ids = downloaded.difference(hashed).difference(
            self.dedup_ignore_ids)
        # NOTE(review): the whole ID set is passed to a single IN (...) clause;
        # confirm the database backend copes with very large sets.
        unfinished = self._session.query(File).filter(
            File.id.in_(search_ids)).all()

        unfinished = list(
            filter(lambda _f: not any(u.album_id for u in _f.urls),
                   unfinished))  # Filter out albums.

        #print("Working on %s files total"%len(unfinished), debug=True)

        if not unfinished:
            return 0

        # Per-pass counters, reported in the summary line after the loop.
        stats = {
            'unique': 0,
            'has_dup': 0,
            'special_hash': 0,
            'not_is_file': 0,
            'is_album': 0
        }
        matches = []
        # Hash of the most recently reported duplicate group; used to shorten
        # the log output when consecutive Files share the same hash.
        last_printed = ''
        for idx, f in enumerate(unfinished):
            self.progress.set_status("Deduplicating %s of %s files..." %
                                     (idx + 1, len(unfinished)))
            #print("Working on  %s/%s files"%(idx, len(unfinished)), debug=True)
            path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                    file_path=f.path)
            is_album = any(u.album_id for u in f.urls)
            if not path.is_file():
                # Missing on disk: remember the ID so later passes skip it.
                stats['not_is_file'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if is_album:
                # Albums were already filtered out above, so this only triggers
                # if the File's URLs report an album at this point.
                stats['is_album'] += 1
                self.dedup_ignore_ids.add(f.id)
                continue
            if self._stop_event.is_set():
                # Stop requested: abandon the remaining candidates.
                break
            new_hash = FileHasher.get_best_hash(path.absolute())
            # print('New hash for File:', f.id, '::', new_hash)
            # for/else: the else branch below runs only when no special hash
            # matched, i.e. the loop finished without hitting `break`.
            for h in self.special_hashes:
                if new_hash == h.full_hash:
                    print("Found special hash:", h, "::\n", f, debug=True)
                    stats['special_hash'] += 1
                    with self._lock:
                        f.hash = Hash.make_hash(f, new_hash)
                        # Repoint this File's URLs at the special hash's File,
                        # then remove the now-redundant local copy.
                        self._session.query(URL).filter(
                            URL.file_id == f.id).update(
                                {URL.file_id: h.file_id})
                        file = SanitizedRelFile(
                            base=settings.get("output.base_dir"),
                            file_path=f.path)
                        if file.is_file():
                            file.delete_file()
                        self._session.commit()
                        break
            else:  # not a special hash
                matches = self._find_matching_files(new_hash, ignore_id=f.id)
                if matches:
                    # Logging only: pick the shortest useful description of the group.
                    if new_hash == last_printed:
                        print("Found another duplicate:",
                              new_hash,
                              "::\n",
                              f,
                              debug=True)
                    elif len(matches) > 6:
                        # Large group: show the first and last three matches only.
                        printed = matches[:3] + [
                            "... %s total matches ..." % len(matches)
                        ] + matches[-3:]
                        print("Found duplicate files: ",
                              new_hash,
                              "::\n",
                              '\n'.join(str(m) for m in [f] + printed),
                              debug=True)
                    else:
                        print("Found duplicate files: ",
                              new_hash,
                              "::\n",
                              '\n'.join(str(m) for m in [f] + matches),
                              debug=True)
                    stats['has_dup'] += 1
                    last_printed = new_hash
                else:
                    stats['unique'] += 1
                # print('\tActual matches:', matches)
                with self._lock:
                    f.hash = Hash.make_hash(f, new_hash)
                    #print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
                    if len(matches):
                        #print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
                        # Keep the best File of the group; every other member
                        # is upgraded to it.
                        best, others = self._choose_best_file(matches + [f])
                        # print('Chose best File:', best.id)
                        for o in others:
                            self._upgrade_file(new_file=best, old_file=o)
                    self._session.commit()
                if matches:
                    print("Completed %s of %s files..." %
                          (idx + 1, len(unfinished)),
                          debug=True)
        dt = datetime.now() - start_time
        # NOTE(review): this summary (and the return value) reports the full
        # candidate count even when the loop exited early via the stop event.
        print("Completed all %s files in %s sec. Counts = %s" %
              (len(unfinished), str(dt), ', '.join(
                  '%s: %s' % (k, v) for k, v in stats.items() if v)),
              debug=True)
        # self.prune_counter += len(matches)
        # if self.prune_counter >= 100:
        # 	self.prune_counter = 0
        #self.progress.set_status("Pruning orphaned files...")
        #self._prune()
        #print("Finished pruning.", debug=True)
        return len(unfinished)