def _choose_best_file(self, files):
	""" Sort candidate Files by on-disk size and return (best, remaining). """
	files = sorted(
		files,
		key=lambda f: SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path).size(),
		reverse=True)
	return files[0], files[1:]

def api_search_posts(fields, term, page_size, page):
	ret = []
	searcher = sql.PostSearcher(_session)
	res = searcher.search_fields(fields, term.strip("%"))
	full_len = len(res)
	res = res[page * page_size:page * page_size + page_size]
	for p in res:
		files = []
		for url in p.urls:
			if not url.file:
				print('Post URL Missing a File:', url)
				continue
			file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=url.file.path)
			if file.is_file():
				files.append({'token': url.file.id, 'path': file.absolute()})
		if len(files):
			ret.append({
				'reddit_id': p.reddit_id,
				'author': p.author,
				'type': p.type,
				'title': p.title,
				'body': p.body,
				'parent_id': p.parent_id,
				'subreddit': p.subreddit,
				'over_18': p.over_18,
				'created_utc': p.created_utc,
				'num_comments': p.num_comments,
				'score': p.score,
				'source_alias': p.source_alias,
				'files': files
			})
	return {'total': full_len, 'results': ret}

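# Hedged usage sketch (not part of the original module; the field names passed in are
# assumptions, chosen only because matching attributes appear in the result dict above):
# search Posts and page through the results, where 'total' is the full match count and
# 'results' is the current page.
#
# 	page_one = api_search_posts(fields=['title', 'body'], term='cats', page_size=25, page=0)
# 	print(page_one['total'], len(page_one['results']))
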
def mock_handler_request(base_dir, target_url):
	""" Simplify generating a HandlerTask, DownloaderProgress, & RelFile object combo, for Handler tests. """
	import processing.handlers as handlers
	from processing.wrappers import SanitizedRelFile, DownloaderProgress
	filename = str(uuid.uuid4())
	_file = SanitizedRelFile(base=base_dir, file_path=filename)
	_task = handlers.HandlerTask(url=target_url, file_obj=_file)
	_prog = DownloaderProgress()
	return _task, _prog, _file

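# Minimal sketch of how a Handler test might use the helper above. This is hypothetical:
# 'some_handler', the test class, and the URL are assumptions, not code from this repo.
#
# 	def test_direct_download(self):
# 		task, progress, file = mock_handler_request(self.dir, 'https://example.com/image.png')
# 		resp = some_handler.handle(task, progress)  # whichever Handler module is under test
# 		self.assertTrue(resp.success)
# 		self.assertTrue(file.is_file())
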
def _upgrade_file(self, new_file, old_file):
	# print('Upgrading old file:', old_file.id, old_file.path, ' -> ', new_file.id, new_file.path)
	self._session.query(URL). \
		filter(URL.file_id == old_file.id). \
		update({URL.file_id: new_file.id})
	file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=old_file.path)
	if file.is_file():
		file.delete_file()

def init_from_settings():
	""" Builds the database file using the Settings currently loaded. """
	# db_file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path="manifest.sqlite")  # original
	# This is part of the change to save manifest.sqlite to a different directory than the downloads.
	db_file = SanitizedRelFile(
		base=settings.get("output.manifest_for_sqlite_dir"),
		file_path="manifest.sqlite"
	)
	db_file.mkdirs()
	init(db_file.absolute())

def _dedupe(self):
	unfinished = self._session\
		.query(File) \
		.options(joinedload(File.urls))\
		.filter(File.hash == None)\
		.filter(File.downloaded == True)\
		.all()
	unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
	if not unfinished:
		return
	for idx, f in enumerate(unfinished):
		self.progress.set_status("Deduplicating (%s) files..." % (len(unfinished) - idx))
		path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
		is_album = any(u.album_id for u in f.urls)
		if not path.is_file() or is_album:
			continue
		new_hash = FileHasher.get_best_hash(path.absolute())
		# print('New hash for File:', f.id, '::', new_hash)
		matches = self._find_matching_files(new_hash, ignore_id=f.id)
		# print('\tActual matches:', matches)
		with self._lock:
			f.hash = Hash.make_hash(f, new_hash)
			if len(matches):
				# print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
				best, others = self._choose_best_file(matches + [f])
				# print('Chose best File:', best.id)
				for o in others:
					self._upgrade_file(new_file=best, old_file=o)
			self._session.commit()
	self._prune()

def init_from_settings():
	""" Builds the database file using the Settings currently loaded. """
	db_file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path="manifest.sqlite")
	db_file.mkdirs()
	init(db_file.absolute())

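# Hedged startup sketch (not part of the original module; the settings-loading call and the
# name 'session' are assumptions): the manifest database is built once settings are loaded,
# after which sessions can be opened, mirroring how run() below calls sql.init_from_settings()
# before sql.session().
#
# 	settings.load('settings.json')  # assumed settings API
# 	init_from_settings()
# 	sess = session()                # assumed to live alongside init() in this module
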
def run(self):
	""" Threaded loading of elements. """
	settings.from_json(self._settings)
	sql.init_from_settings()
	self._session = sql.session()
	self.progress.clear(status="Starting up...", running=True)
	failed = False
	for nxt_id in self._reader:
		try:
			url = self._session.query(sql.URL).filter(sql.URL.id == nxt_id).first()
			if not url:
				raise Exception("Unknown URL ID provided: (%s)" % nxt_id)
			file = url.file
			path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=str(file.path))
			self.progress.set_file(path.relative())
			self.progress.set_status("Attempting to Handle URL...")
			self.progress.set_running(True)
			task = handlers.HandlerTask(url=url.address, file_obj=path)
			resp = handlers.handle(task, self.progress)
			is_album_parent = False
			with self._db_lock:
				if resp.album_urls:
					if url.album_id:
						resp.album_urls = []  # Ignore nested Albums to avoid recursion.
					else:
						url.album_id = str(uuid.uuid4())
						is_album_parent = True
				else:
					resp.album_urls = []
				url.failed = not resp.success
				url.failure_reason = resp.failure_reason
				url.last_handler = resp.handler
				url.album_is_parent = is_album_parent
				if resp.rel_file:
					file.downloaded = True
					file.path = resp.rel_file.relative()
					file.hash = None
					utime(resp.rel_file.absolute(), times=(time(), time()))
				self._session.commit()
			# Once *all* processing is completed on this URL, the Downloader needs to ACK it.
			# If any additional Album URLS were located, they should be sent before the ACK.
			self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=resp.album_urls))
			self.progress.clear(status="Waiting for URL...")
		except Exception as ex:
			failed = str(ex)
			self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=[]))
			print(ex)
			traceback.print_exc()
			self.progress.set_error("Exited with error: {%s}" % failed)
			break
	sql.close()
	self.progress.clear("Finished." if not failed else "Exited with error: %s" % failed, running=False)

def _dedupe(self):
	# unfinished = self._session\
	# 	.query(File) \
	# 	.options(joinedload(File.urls))\
	# 	.filter(File.hash == None)\
	# 	.filter(File.downloaded == True)\
	# 	.all()
	start_time = datetime.now()
	hashed = set(int(r.file_id) for r in self._session.query(Hash.file_id) \
		.filter(Hash.full_hash != None, Hash.file_id != None))
	downloaded = set(r.id for r in self._session.query(File).filter(File.downloaded == True))
	# get downloaded files without a hash
	search_ids = downloaded.difference(hashed).difference(self.dedup_ignore_ids)
	unfinished = self._session.query(File).filter(File.id.in_(search_ids)).all()
	unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
	# print("Working on %s files total" % len(unfinished), debug=True)
	if not unfinished:
		return 0
	stats = {'unique': 0, 'has_dup': 0, 'special_hash': 0, 'not_is_file': 0, 'is_album': 0}
	matches = []
	last_printed = ''
	for idx, f in enumerate(unfinished):
		self.progress.set_status("Deduplicating %s of %s files..." % (idx + 1, len(unfinished)))
		# print("Working on %s/%s files" % (idx, len(unfinished)), debug=True)
		path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
		is_album = any(u.album_id for u in f.urls)
		if not path.is_file():
			stats['not_is_file'] += 1
			self.dedup_ignore_ids.add(f.id)
			continue
		if is_album:
			stats['is_album'] += 1
			self.dedup_ignore_ids.add(f.id)
			continue
		if self._stop_event.is_set():
			break
		new_hash = FileHasher.get_best_hash(path.absolute())
		# print('New hash for File:', f.id, '::', new_hash)
		for h in self.special_hashes:
			if new_hash == h.full_hash:
				print("Found special hash:", h, "::\n", f, debug=True)
				stats['special_hash'] += 1
				with self._lock:
					f.hash = Hash.make_hash(f, new_hash)
					self._session.query(URL).filter(URL.file_id == f.id).update({URL.file_id: h.file_id})
					file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
					if file.is_file():
						file.delete_file()
					self._session.commit()
				break
		else:  # not a special hash
			matches = self._find_matching_files(new_hash, ignore_id=f.id)
			if matches:
				if new_hash == last_printed:
					print("Found another duplicate:", new_hash, "::\n", f, debug=True)
				elif len(matches) > 6:
					printed = matches[:3] + ["... %s total matches ..." % len(matches)] + matches[-3:]
					print("Found duplicate files: ", new_hash, "::\n",
						'\n'.join(str(m) for m in [f] + printed), debug=True)
				else:
					print("Found duplicate files: ", new_hash, "::\n",
						'\n'.join(str(m) for m in [f] + matches), debug=True)
				stats['has_dup'] += 1
				last_printed = new_hash
			else:
				stats['unique'] += 1
			# print('\tActual matches:', matches)
			with self._lock:
				f.hash = Hash.make_hash(f, new_hash)
				# print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
				if len(matches):
					# print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
					best, others = self._choose_best_file(matches + [f])
					# print('Chose best File:', best.id)
					for o in others:
						self._upgrade_file(new_file=best, old_file=o)
				self._session.commit()
			if matches:
				print("Completed %s of %s files..." % (idx + 1, len(unfinished)), debug=True)
	dt = datetime.now() - start_time
	print("Completed all %s files in %s sec. Counts = %s" % (
		len(unfinished), str(dt),
		', '.join('%s: %s' % (k, v) for k, v in stats.items() if v)), debug=True)
	# self.prune_counter += len(matches)
	# if self.prune_counter >= 100:
	# 	self.prune_counter = 0
	# 	self.progress.set_status("Pruning orphaned files...")
	# 	self._prune()
	# 	print("Finished pruning.", debug=True)
	return len(unfinished)