def api_search_posts(fields, term, page_size, page):
    """ Search downloaded Posts by the given fields/term, returning one page of results plus the total match count. """
    ret = []
    searcher = sql.PostSearcher(_session)
    res = searcher.search_fields(fields, term.strip("%"))
    full_len = len(res)
    res = res[page * page_size:page * page_size + page_size]
    for p in res:
        files = []
        for url in p.urls:
            if not url.file:
                print('Post URL Missing a File:', url)
                continue
            file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=url.file.path)
            if file.is_file():
                files.append({'token': url.file.id, 'path': file.absolute()})
        if len(files):
            ret.append({
                'reddit_id': p.reddit_id,
                'author': p.author,
                'type': p.type,
                'title': p.title,
                'body': p.body,
                'parent_id': p.parent_id,
                'subreddit': p.subreddit,
                'over_18': p.over_18,
                'created_utc': p.created_utc,
                'num_comments': p.num_comments,
                'score': p.score,
                'source_alias': p.source_alias,
                'files': files
            })
    return {'total': full_len, 'results': ret}
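# A minimal, self-contained sketch (not project code) of the paging math api_search_posts() uses:
# the full result list is sliced down to one page, while 'total' still reports the pre-slice count
# so a caller can render page controls. All names here are illustrative only.
def paginate(results, page_size, page):
    """ Return one page of `results` plus the total count, mirroring the slice above. """
    total = len(results)
    start = page * page_size
    return {'total': total, 'results': results[start:start + page_size]}

# Example: paginate(list(range(10)), page_size=3, page=1) -> {'total': 10, 'results': [3, 4, 5]}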
def init_from_settings():
    """ Builds the database file using the Settings currently loaded. """
    # db_file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path="manifest.sqlite")  # original
    # This is part of the change to save manifest.sqlite to a different directory than the downloads.
    db_file = SanitizedRelFile(
        base=settings.get("output.manifest_for_sqlite_dir"),
        file_path="manifest.sqlite"
    )
    db_file.mkdirs()
    init(db_file.absolute())
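# Hypothetical sketch of the directory handling this change relies on: the manifest path is built
# from a separate setting ("output.manifest_for_sqlite_dir") instead of the download base dir, and
# the parent directory must exist before SQLite can create the file. The fallback to the download
# directory when the new setting is unset is an assumption, not behavior taken from the project.
from pathlib import Path

def resolve_manifest_path(manifest_dir, base_dir):
    """ Pick the manifest directory (falling back to the download dir) and ensure it exists. """
    directory = Path(manifest_dir or base_dir).expanduser()
    directory.mkdir(parents=True, exist_ok=True)  # mirrors db_file.mkdirs() above
    return directory / "manifest.sqlite"

# Example: resolve_manifest_path("~/rmd-manifests", "~/Downloads/rmd")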
def _dedupe(self):
    unfinished = self._session \
        .query(File) \
        .options(joinedload(File.urls)) \
        .filter(File.hash == None) \
        .filter(File.downloaded == True) \
        .all()
    unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
    if not unfinished:
        return
    for idx, f in enumerate(unfinished):
        self.progress.set_status("Deduplicating (%s) files..." % (len(unfinished) - idx))
        path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
        is_album = any(u.album_id for u in f.urls)
        if not path.is_file() or is_album:
            continue
        new_hash = FileHasher.get_best_hash(path.absolute())
        # print('New hash for File:', f.id, '::', new_hash)
        matches = self._find_matching_files(new_hash, ignore_id=f.id)
        # print('\tActual matches:', matches)
        with self._lock:
            f.hash = Hash.make_hash(f, new_hash)
            if len(matches):
                # print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
                best, others = self._choose_best_file(matches + [f])
                # print('Chose best File:', best.id)
                for o in others:
                    self._upgrade_file(new_file=best, old_file=o)
            self._session.commit()
    self._prune()
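# Illustrative sketch (stand-in data, not project code) of the pattern _dedupe() follows:
# hash each candidate, look for earlier files with the same hash, keep one "best" copy, and
# re-point the duplicates at it. Here "best" is simply the first file seen for a given hash.
def dedupe_by_hash(files, hash_fn):
    """ files: list of (file_id, payload). Returns {hash: kept_id} and a remap of dropped ids. """
    kept, remap = {}, {}
    for file_id, payload in files:
        h = hash_fn(payload)
        if h in kept:
            remap[file_id] = kept[h]   # duplicate: point this id at the kept copy
        else:
            kept[h] = file_id          # first occurrence wins
    return kept, remap

# Example: dedupe_by_hash([(1, b"a"), (2, b"b"), (3, b"a")], hash_fn=bytes)
# -> ({b"a": 1, b"b": 2}, {3: 1})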
def init_from_settings():
    """ Builds the database file using the Settings currently loaded. """
    db_file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path="manifest.sqlite")
    db_file.mkdirs()
    init(db_file.absolute())
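# Minimal stdlib-only sketch (outside the project) of why db_file.mkdirs() runs before init():
# SQLite will happily create the database file on first connect, but it will not create the
# file's parent directories, so they have to exist first.
import sqlite3
from pathlib import Path

def create_manifest(path):
    """ Ensure the parent directory exists, then let SQLite create/open the file. """
    db = Path(path).expanduser()
    db.parent.mkdir(parents=True, exist_ok=True)
    with sqlite3.connect(db) as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS sanity_check (id INTEGER PRIMARY KEY)")
    return db

# Example: create_manifest("~/Downloads/rmd/manifest.sqlite")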
def _dedupe(self):
    # unfinished = self._session\
    #     .query(File) \
    #     .options(joinedload(File.urls))\
    #     .filter(File.hash == None)\
    #     .filter(File.downloaded == True)\
    #     .all()
    start_time = datetime.now()
    hashed = set(int(r.file_id) for r in self._session.query(Hash.file_id)
                 .filter(Hash.full_hash != None, Hash.file_id != None))
    downloaded = set(r.id for r in self._session.query(File).filter(File.downloaded == True))
    # get downloaded files without a hash
    search_ids = downloaded.difference(hashed).difference(self.dedup_ignore_ids)
    unfinished = self._session.query(File).filter(File.id.in_(search_ids)).all()
    unfinished = list(filter(lambda _f: not any(u.album_id for u in _f.urls), unfinished))  # Filter out albums.
    # print("Working on %s files total" % len(unfinished), debug=True)
    if not unfinished:
        return 0
    stats = {'unique': 0, 'has_dup': 0, 'special_hash': 0, 'not_is_file': 0, 'is_album': 0}
    matches = []
    last_printed = ''
    for idx, f in enumerate(unfinished):
        self.progress.set_status("Deduplicating %s of %s files..." % (idx + 1, len(unfinished)))
        # print("Working on %s/%s files" % (idx, len(unfinished)), debug=True)
        path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
        is_album = any(u.album_id for u in f.urls)
        if not path.is_file():
            stats['not_is_file'] += 1
            self.dedup_ignore_ids.add(f.id)
            continue
        if is_album:
            stats['is_album'] += 1
            self.dedup_ignore_ids.add(f.id)
            continue
        if self._stop_event.is_set():
            break
        new_hash = FileHasher.get_best_hash(path.absolute())
        # print('New hash for File:', f.id, '::', new_hash)
        for h in self.special_hashes:
            if new_hash == h.full_hash:
                print("Found special hash:", h, "::\n", f, debug=True)
                stats['special_hash'] += 1
                with self._lock:
                    f.hash = Hash.make_hash(f, new_hash)
                    self._session.query(URL).filter(URL.file_id == f.id).update({URL.file_id: h.file_id})
                    file = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=f.path)
                    if file.is_file():
                        file.delete_file()
                    self._session.commit()
                break
        else:  # not a special hash
            matches = self._find_matching_files(new_hash, ignore_id=f.id)
            if matches:
                if new_hash == last_printed:
                    print("Found another duplicate:", new_hash, "::\n", f, debug=True)
                elif len(matches) > 6:
                    printed = matches[:3] + ["... %s total matches ..." % len(matches)] + matches[-3:]
                    print("Found duplicate files: ", new_hash, "::\n",
                          '\n'.join(str(m) for m in [f] + printed), debug=True)
                else:
                    print("Found duplicate files: ", new_hash, "::\n",
                          '\n'.join(str(m) for m in [f] + matches), debug=True)
                stats['has_dup'] += 1
                last_printed = new_hash
            else:
                stats['unique'] += 1
            # print('\tActual matches:', matches)
            with self._lock:
                f.hash = Hash.make_hash(f, new_hash)
                # print("Updating hash: ", f.id, f.hash.file_id, f.hash, debug=True)
                if len(matches):
                    # print("Found duplicate files: ", new_hash, "::", [(m.id, m.path) for m in matches])
                    best, others = self._choose_best_file(matches + [f])
                    # print('Chose best File:', best.id)
                    for o in others:
                        self._upgrade_file(new_file=best, old_file=o)
                self._session.commit()
            if matches:
                print("Completed %s of %s files..." % (idx + 1, len(unfinished)), debug=True)
    dt = datetime.now() - start_time
    print("Completed all %s files in %s sec. Counts = %s" % (
        len(unfinished), str(dt),
        ', '.join('%s: %s' % (k, v) for k, v in stats.items() if v)), debug=True)
    # self.prune_counter += len(matches)
    # if self.prune_counter >= 100:
    #     self.prune_counter = 0
    #     self.progress.set_status("Pruning orphaned files...")
    #     self._prune()
    #     print("Finished pruning.", debug=True)
    return len(unfinished)
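# Stand-alone sketch (illustrative names only) of the candidate selection the rewritten _dedupe()
# uses: instead of loading every un-hashed File row with a joined query, it takes the set of
# downloaded file ids, subtracts ids that already have a full hash, and subtracts ids it has
# already decided to skip (missing files, albums), so repeated passes stay cheap.
def pick_dedupe_candidates(downloaded_ids, hashed_ids, ignore_ids):
    """ Return the ids that still need hashing on this pass. """
    return set(downloaded_ids) - set(hashed_ids) - set(ignore_ids)

# Example: pick_dedupe_candidates({1, 2, 3, 4}, {1}, {4}) -> {2, 3}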