def GetTotalSize(self, hash_ids: typing.Collection[int]) -> int:
    """Return the summed on-disk size of the given files, 0 if none are known."""
    if len(hash_ids) == 1:
        # one id: a direct indexed lookup beats building a temp table
        (single_hash_id, ) = hash_ids
        row = self._c.execute(
            'SELECT size FROM files_info WHERE hash_id = ?;',
            (single_hash_id, )).fetchone()
    else:
        # many ids: join through a temporary integer table and let SQL sum
        with HydrusDB.TemporaryIntegerTable(
                self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
            row = self._c.execute(
                'SELECT SUM( size ) FROM {} CROSS JOIN files_info USING ( hash_id );'
                .format(temp_hash_ids_table_name)).fetchone()
    if row is None:
        return 0
    (total, ) = row
    return total
def FilterAllPendingHashIds(self, hash_ids, just_these_service_ids=None):
    """Return the subset of hash_ids pending to any specific file service.

    just_these_service_ids, when given, restricts the check to those services.
    """
    if just_these_service_ids is None:
        service_ids = self.modules_services.GetServiceIds(
            HC.SPECIFIC_FILE_SERVICES)
    else:
        service_ids = just_these_service_ids
    pending_hash_ids = set()
    with HydrusDB.TemporaryIntegerTable(
            self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        for service_id in service_ids:
            # temp hashes to this service's pending files
            pending_table = GenerateFilesTableName(
                service_id, HC.CONTENT_STATUS_PENDING)
            query = 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id );'.format(
                temp_hash_ids_table_name, pending_table)
            pending_hash_ids.update(self._STI(self._c.execute(query)))
    return pending_hash_ids
def _PopulateTagIdsToTagsCache( self, tag_ids ):
    """Ensure every id in tag_ids has an entry in self._tag_ids_to_tags_cache.

    Any tag_id with no backing rows in tags/namespaces/subtags is repaired in
    place with a generated 'unknown tag:...' placeholder so callers never miss.
    """
    # cap cache growth: once it is big, keep only the entries this request needs
    if len( self._tag_ids_to_tags_cache ) > 100000:
        if not isinstance( tag_ids, set ):
            tag_ids = set( tag_ids )
        self._tag_ids_to_tags_cache = { tag_id : tag for ( tag_id, tag ) in self._tag_ids_to_tags_cache.items() if tag_id in tag_ids }
    uncached_tag_ids = { tag_id for tag_id in tag_ids if tag_id not in self._tag_ids_to_tags_cache }
    if len( uncached_tag_ids ) > 0:
        if len( uncached_tag_ids ) == 1:
            # single id: indexed lookup, no temp table needed
            ( uncached_tag_id, ) = uncached_tag_ids
            rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM tags NATURAL JOIN namespaces NATURAL JOIN subtags WHERE tag_id = ?;', ( uncached_tag_id, ) ).fetchall()
        else:
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_tag_ids, 'tag_id' ) as temp_table_name:
                # temp tag_ids to tags to subtags and namespaces
                rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM {} CROSS JOIN tags USING ( tag_id ) CROSS JOIN subtags USING ( subtag_id ) CROSS JOIN namespaces USING ( namespace_id );'.format( temp_table_name ) ).fetchall()
        uncached_tag_ids_to_tags = { tag_id : HydrusTags.CombineTag( namespace, subtag ) for ( tag_id, namespace, subtag ) in rows }
        if len( uncached_tag_ids_to_tags ) < len( uncached_tag_ids ):
            # some ids had no definition rows--invent placeholder tags and
            # persist them so the database is consistent from now on
            for tag_id in uncached_tag_ids:
                if tag_id not in uncached_tag_ids_to_tags:
                    tag = 'unknown tag:' + HydrusData.GenerateKey().hex()
                    ( namespace, subtag ) = HydrusTags.SplitTag( tag )
                    namespace_id = self.GetNamespaceId( namespace )
                    subtag_id = self.GetSubtagId( subtag )
                    self._c.execute( 'REPLACE INTO tags ( tag_id, namespace_id, subtag_id ) VALUES ( ?, ?, ? );', ( tag_id, namespace_id, subtag_id ) )
                    uncached_tag_ids_to_tags[ tag_id ] = tag
        self._tag_ids_to_tags_cache.update( uncached_tag_ids_to_tags )
def GetHashIdsThatHaveTagsSimpleLocation( self, tag_display_type: int, file_service_key: bytes, tag_search_context: ClientSearch.TagSearchContext, namespace_ids_table_name = None, hash_ids_table_name = None, job_key = None ):
    """Return the set of hash_ids that have at least one tag mapping.

    namespace_ids_table_name / hash_ids_table_name, when given, are temporary
    integer tables that restrict the search to those namespaces / files.
    A cancelled job_key makes this return an empty set.
    """
    mapping_and_tag_table_names = self.modules_tag_search.GetMappingAndTagTables( tag_display_type, file_service_key, tag_search_context )
    # choose the cheapest query shape for the filters we actually have
    if hash_ids_table_name is None:
        if namespace_ids_table_name is None:
            # hellmode
            queries = [ 'SELECT DISTINCT hash_id FROM {};'.format( mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
        else:
            # temp namespaces to tags to mappings
            queries = [ 'SELECT DISTINCT hash_id FROM {} CROSS JOIN {} USING ( namespace_id ) CROSS JOIN {} USING ( tag_id );'.format( namespace_ids_table_name, tags_table_name, mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
    else:
        if namespace_ids_table_name is None:
            # temp hashes filtered by a correlated EXISTS on each mappings table
            queries = [ 'SELECT hash_id FROM {} WHERE EXISTS ( SELECT 1 FROM {} WHERE {}.hash_id = {}.hash_id );'.format( hash_ids_table_name, mappings_table_name, mappings_table_name, hash_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
        else:
            # temp hashes to mappings to tags to temp namespaces
            # this was originally a 'WHERE EXISTS' thing, but doing that on a three way cross join is too complex for that to work well
            # let's hope DISTINCT can save time too
            queries = [ 'SELECT DISTINCT hash_id FROM {} CROSS JOIN {} USING ( hash_id ) CROSS JOIN {} USING ( tag_id ) CROSS JOIN {} USING ( namespace_id );'.format( hash_ids_table_name, mappings_table_name, tags_table_name, namespace_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
    cancelled_hook = None
    if job_key is not None:
        cancelled_hook = job_key.IsCancelled
    nonzero_tag_hash_ids = set()
    for query in queries:
        cursor = self._Execute( query )
        # cancellable block-read so long queries can be interrupted between blocks
        nonzero_tag_hash_ids.update( self._STI( HydrusDB.ReadFromCancellableCursor( cursor, 10240, cancelled_hook ) ) )
        if job_key is not None and job_key.IsCancelled():
            return set()
    return nonzero_tag_hash_ids
def RegenerateSearchableSubtagMap(self, file_service_id, tag_service_id, status_hook=None):
    """Rebuild the subtag -> searchable-subtag map for one (file, tag) service pair.

    status_hook, when given, receives a progress message after each chunk.
    """
    fts4_table = self.GetSubtagsFTS4TableName(file_service_id, tag_service_id)
    map_table = self.GetSubtagsSearchableMapTableName(file_service_id, tag_service_id)
    # wipe the old map and rebuild from every subtag currently in the fts4 cache
    self._Execute('DELETE FROM {};'.format(map_table))
    query = 'SELECT docid FROM {};'.format(fts4_table)
    BLOCK_SIZE = 10000
    for (group_of_subtag_ids, num_done, num_to_do) in HydrusDB.ReadLargeIdQueryInSeparateChunks(self._c, query, BLOCK_SIZE):
        for subtag_id in group_of_subtag_ids:
            row = self._Execute(
                'SELECT subtag FROM subtags WHERE subtag_id = ?;',
                (subtag_id, )).fetchone()
            if row is None:
                continue
            (subtag, ) = row
            searchable_subtag = ClientSearch.ConvertSubtagToSearchable(subtag)
            # only subtags whose searchable form differs need a map entry
            if searchable_subtag != subtag:
                searchable_subtag_id = self.modules_tags.GetSubtagId(searchable_subtag)
                self._Execute(
                    'INSERT OR IGNORE INTO {} ( subtag_id, searchable_subtag_id ) VALUES ( ?, ? );'
                    .format(map_table), (subtag_id, searchable_subtag_id))
        message = HydrusData.ConvertValueRangeToPrettyString(num_done, num_to_do)
        HG.client_controller.frame_splash_status.SetSubtext(message)
        if status_hook is not None:
            status_hook(message)
def ClearDeleteRecord(self, service_id, hash_ids):
    """Drop the delete records for the given hashes on one service.

    Returns the number of rows actually removed.
    """
    deleted_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_DELETED)
    statement = 'DELETE FROM {} WHERE hash_id = ?;'.format(deleted_table)
    self._c.executemany(statement, ((hash_id, ) for hash_id in hash_ids))
    return HydrusDB.GetRowCount(self._c)
def GetUndeleteRows(self, service_id, hash_ids):
    """Fetch ( hash_id, original_timestamp ) rows from the service's delete record for the given hashes."""
    deleted_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_DELETED)
    with HydrusDB.TemporaryIntegerTable(
            self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        # temp hashes to deleted files
        query = 'SELECT hash_id, original_timestamp FROM {} CROSS JOIN {} USING ( hash_id );'.format(
            temp_hash_ids_table_name, deleted_table)
        return self._c.execute(query).fetchall()
def GetCurrentHashIdsToTimestamps(self, service_id, hash_ids):
    """Map each given hash_id that is current on the service to its timestamp."""
    current_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_CURRENT)
    with HydrusDB.TemporaryIntegerTable(
            self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        # temp hashes to current files
        query = 'SELECT hash_id, timestamp FROM {} CROSS JOIN {} USING ( hash_id );'.format(
            temp_hash_ids_table_name, current_table)
        return dict(self._c.execute(query))
def RecordDeleteFiles(self, service_id, insert_rows):
    """Add ( hash_id, original_timestamp ) delete records for a service.

    Returns how many records were actually new.
    """
    deleted_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_DELETED)
    now = HydrusData.GetNow()
    statement = 'INSERT OR IGNORE INTO {} ( hash_id, timestamp, original_timestamp ) VALUES ( ?, ?, ? );'.format(
        deleted_table)
    self._c.executemany(
        statement,
        ((hash_id, now, original_timestamp)
         for (hash_id, original_timestamp) in insert_rows))
    return HydrusDB.GetRowCount(self._c)
def FilterPendingHashIds(self, service_id, hash_ids):
    """Return the subset of hash_ids pending on this service.

    The combined file service treats everything as pending-eligible, so all
    the given hashes pass straight through for it.
    """
    if service_id == self.modules_services.combined_file_service_id:
        return set(hash_ids)
    with HydrusDB.TemporaryIntegerTable(
            self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        # temp hashes to pending files
        pending_table = GenerateFilesTableName(service_id, HC.CONTENT_STATUS_PENDING)
        query = 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id );'.format(
            temp_hash_ids_table_name, pending_table)
        return self._STS(self._c.execute(query))
def AddFiles(self, service_id, insert_rows):
    """Make ( hash_id, timestamp ) rows current on the service and clear any matching pending rows.

    Returns True if any pending rows were removed.
    """
    (current_table, deleted_table, pending_table,
     petitioned_table) = GenerateFilesTableNames(service_id)
    self._c.executemany(
        'INSERT OR IGNORE INTO {} VALUES ( ?, ? );'.format(current_table),
        ((hash_id, timestamp) for (hash_id, timestamp) in insert_rows))
    # a file that just arrived is no longer pending
    self._c.executemany(
        'DELETE FROM {} WHERE hash_id = ?;'.format(pending_table),
        ((hash_id, ) for (hash_id, timestamp) in insert_rows))
    return HydrusDB.GetRowCount(self._c) > 0
def AssociatePHashes(self, hash_id, phashes):
    """Link the given perceptual hashes to a file; returns the phash_ids used.

    If any mapping is new, the file's cached search distance is reset so the
    similar-files search will revisit it.
    """
    phash_ids = {self._GetPHashId(phash) for phash in phashes}
    self._c.executemany(
        'INSERT OR IGNORE INTO shape_perceptual_hash_map ( phash_id, hash_id ) VALUES ( ?, ? );',
        ((phash_id, hash_id) for phash_id in phash_ids))
    if HydrusDB.GetRowCount(self._c) > 0:
        # new mappings appeared, so any previous search result is stale
        self._c.execute(
            'REPLACE INTO shape_search_cache ( hash_id, searched_distance ) VALUES ( ?, ? );',
            (hash_id, None))
    return phash_ids
def RemoveFiles(self, service_id, hash_ids):
    """Drop the given hashes from the service's current and petitioned tables.

    Returns True if any petitioned rows were removed.
    """
    (current_table, deleted_table, pending_table,
     petitioned_table) = GenerateFilesTableNames(service_id)
    as_params = [(hash_id, ) for hash_id in hash_ids]
    self._c.executemany(
        'DELETE FROM {} WHERE hash_id = ?;'.format(current_table), as_params)
    self._c.executemany(
        'DELETE FROM {} WHERE hash_id = ?;'.format(petitioned_table), as_params)
    return HydrusDB.GetRowCount(self._c) > 0
def _PopulateHashIdsToHashesCache( self, hash_ids ):
    """Ensure every id in hash_ids has an entry in self._hash_ids_to_hashes_cache.

    Tries the fast local_hashes_cache table first, then falls back to the
    master hashes module for anything still missing.
    """
    # cap cache growth: once it is big, keep only the entries this request needs
    if len( self._hash_ids_to_hashes_cache ) > 100000:
        if not isinstance( hash_ids, set ):
            hash_ids = set( hash_ids )
        self._hash_ids_to_hashes_cache = { hash_id : hash for ( hash_id, hash ) in self._hash_ids_to_hashes_cache.items() if hash_id in hash_ids }
    uncached_hash_ids = { hash_id for hash_id in hash_ids if hash_id not in self._hash_ids_to_hashes_cache }
    if len( uncached_hash_ids ) > 0:
        if len( uncached_hash_ids ) == 1:
            ( uncached_hash_id, ) = uncached_hash_ids
            # this makes 0 or 1 rows, so do fetchall rather than fetchone
            local_uncached_hash_ids_to_hashes = { hash_id : hash for ( hash_id, hash ) in self._c.execute( 'SELECT hash_id, hash FROM local_hashes_cache WHERE hash_id = ?;', ( uncached_hash_id, ) ) }
        else:
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_hash_ids, 'hash_id' ) as temp_table_name:
                # temp hash_ids to actual hashes
                local_uncached_hash_ids_to_hashes = { hash_id : hash for ( hash_id, hash ) in self._c.execute( 'SELECT hash_id, hash FROM {} CROSS JOIN local_hashes_cache USING ( hash_id );'.format( temp_table_name ) ) }
        self._hash_ids_to_hashes_cache.update( local_uncached_hash_ids_to_hashes )
        # anything the local cache did not know comes from the master hashes module
        uncached_hash_ids = { hash_id for hash_id in uncached_hash_ids if hash_id not in self._hash_ids_to_hashes_cache }
        if len( uncached_hash_ids ) > 0:
            hash_ids_to_hashes = self.modules_hashes.GetHashIdsToHashes( hash_ids = uncached_hash_ids )
            self._hash_ids_to_hashes_cache.update( hash_ids_to_hashes )
def _PopulateTagIdsToTagsCache( self, tag_ids ):
    """Ensure every id in tag_ids has an entry in self._tag_ids_to_tags_cache.

    Tries the fast local_tags_cache table first, then falls back to the
    master tags module for anything still missing.
    """
    # cap cache growth: once it is big, keep only the entries this request needs
    if len( self._tag_ids_to_tags_cache ) > 100000:
        if not isinstance( tag_ids, set ):
            tag_ids = set( tag_ids )
        self._tag_ids_to_tags_cache = { tag_id : tag for ( tag_id, tag ) in self._tag_ids_to_tags_cache.items() if tag_id in tag_ids }
    uncached_tag_ids = { tag_id for tag_id in tag_ids if tag_id not in self._tag_ids_to_tags_cache }
    if len( uncached_tag_ids ) > 0:
        if len( uncached_tag_ids ) == 1:
            ( uncached_tag_id, ) = uncached_tag_ids
            # this makes 0 or 1 rows, so do fetchall rather than fetchone
            local_uncached_tag_ids_to_tags = { tag_id : tag for ( tag_id, tag ) in self._c.execute( 'SELECT tag_id, tag FROM local_tags_cache WHERE tag_id = ?;', ( uncached_tag_id, ) ) }
        else:
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_tag_ids, 'tag_id' ) as temp_table_name:
                # temp tag_ids to actual tags
                local_uncached_tag_ids_to_tags = { tag_id : tag for ( tag_id, tag ) in self._c.execute( 'SELECT tag_id, tag FROM {} CROSS JOIN local_tags_cache USING ( tag_id );'.format( temp_table_name ) ) }
        self._tag_ids_to_tags_cache.update( local_uncached_tag_ids_to_tags )
        # anything the local cache did not know comes from the master tags module
        uncached_tag_ids = { tag_id for tag_id in uncached_tag_ids if tag_id not in self._tag_ids_to_tags_cache }
        if len( uncached_tag_ids ) > 0:
            tag_ids_to_tags = self.modules_tags.GetTagIdsToTags( tag_ids = uncached_tag_ids )
            self._tag_ids_to_tags_cache.update( tag_ids_to_tags )
def Repopulate(self):
    """Wipe the cache and refill it from the current storage mappings of every real tag service."""
    self.ClearCache()
    tag_service_ids = self.modules_services.GetServiceIds(HC.REAL_TAG_SERVICES)
    queries = []
    for tag_service_id in tag_service_ids:
        queries.append(
            self.modules_mappings_counts.GetQueryPhraseForCurrentTagIds(
                ClientTags.TAG_DISPLAY_STORAGE,
                self.modules_services.combined_local_file_service_id,
                tag_service_id))
    full_query = '{};'.format(' UNION '.join(queries))
    # chunked read keeps memory bounded when there are many tags
    for (block_of_tag_ids, num_done,
         num_to_do) in HydrusDB.ReadLargeIdQueryInSeparateChunks(
             self._c, full_query, 1024):
        self.AddTagIdsToCache(block_of_tag_ids)
def GetNumViewable(self, hash_ids: typing.Collection[int]) -> int:
    """Count how many of the given files have a searchable/viewable mime."""
    if len(hash_ids) == 1:
        # one id: direct indexed lookup
        (single_hash_id, ) = hash_ids
        mimes = self._STL(
            self._c.execute('SELECT mime FROM files_info WHERE hash_id = ?;',
                            (single_hash_id, )))
    else:
        # many ids: join through a temporary integer table
        with HydrusDB.TemporaryIntegerTable(
                self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
            mimes = self._STL(
                self._c.execute(
                    'SELECT mime FROM {} CROSS JOIN files_info USING ( hash_id );'
                    .format(temp_hash_ids_table_name)))
    return sum(1 for mime in mimes if mime in HC.SEARCHABLE_MIMES)
def GetServiceIdCounts(self, hash_ids) -> typing.Dict[int, int]:
    """For each specific file service, count how many of the given hashes are current there."""
    service_ids_to_counts = {}
    with HydrusDB.TemporaryIntegerTable(
            self._c, hash_ids, 'hash_id') as temp_hash_ids_table_name:
        for service_id in self.modules_services.GetServiceIds(
                HC.SPECIFIC_FILE_SERVICES):
            current_table = GenerateFilesTableName(service_id,
                                                   HC.CONTENT_STATUS_CURRENT)
            # temp hashes to files
            query = 'SELECT COUNT( * ) FROM {} CROSS JOIN {} USING ( hash_id );'.format(
                temp_hash_ids_table_name, current_table)
            (count, ) = self._c.execute(query).fetchone()
            service_ids_to_counts[service_id] = count
    return service_ids_to_counts
def MaintainTree(self, maintenance_mode=HC.MAINTENANCE_FORCED, job_key=None, stop_time=None):
    """Rebalance flagged branches of the similar-files search tree until none remain or we are told to stop.

    maintenance_mode/stop_time feed the controller's should-stop check; job_key,
    if not supplied, is created here and published as a modal popup after 5s.
    """
    time_started = HydrusData.GetNow()
    pub_job_key = False
    job_key_pubbed = False
    if job_key is None:
        job_key = ClientThreading.JobKey(cancellable=True)
        pub_job_key = True
    try:
        job_key.SetVariable('popup_title', 'similar files metadata maintenance')
        rebalance_phash_ids = self._STL(
            self._c.execute(
                'SELECT phash_id FROM shape_maintenance_branch_regen;'))
        num_to_do = len(rebalance_phash_ids)
        while len(rebalance_phash_ids) > 0:
            # only surface the popup if we created the job key and work is taking a while
            if pub_job_key and not job_key_pubbed and HydrusData.TimeHasPassed(
                    time_started + 5):
                HG.client_controller.pub('modal_message', job_key)
                job_key_pubbed = True
            (i_paused, should_quit) = job_key.WaitIfNeeded()
            should_stop = HG.client_controller.ShouldStopThisWork(
                maintenance_mode, stop_time=stop_time)
            if should_quit or should_stop:
                return
            num_done = num_to_do - len(rebalance_phash_ids)
            text = 'rebalancing similar file metadata - ' + HydrusData.ConvertValueRangeToPrettyString(
                num_done, num_to_do)
            HG.client_controller.frame_splash_status.SetSubtext(text)
            job_key.SetVariable('popup_text_1', text)
            job_key.SetVariable('popup_gauge_1', (num_done, num_to_do))
            with HydrusDB.TemporaryIntegerTable(
                    self._c, rebalance_phash_ids,
                    'phash_id') as temp_table_name:
                # temp phashes to tree
                # regenerating the most populous flagged branch first also covers its flagged descendants
                (biggest_phash_id, ) = self._c.execute(
                    'SELECT phash_id FROM {} CROSS JOIN shape_vptree USING ( phash_id ) ORDER BY inner_population + outer_population DESC;'
                    .format(temp_table_name)).fetchone()
            self._RegenerateBranch(job_key, biggest_phash_id)
            # refresh the worklist--the regen may have cleared several flags at once
            rebalance_phash_ids = self._STL(
                self._c.execute(
                    'SELECT phash_id FROM shape_maintenance_branch_regen;'))
    finally:
        job_key.SetVariable('popup_text_1', 'done!')
        job_key.DeleteVariable('popup_gauge_1')
        job_key.DeleteVariable('popup_text_2')  # used in the regenbranch call
        job_key.Finish()
        job_key.Delete(5)
def Generate(self, tag_service_id): self.modules_mappings_counts.CreateTables( ClientTags.TAG_DISPLAY_STORAGE, self.modules_services.combined_file_service_id, tag_service_id) # (current_mappings_table_name, deleted_mappings_table_name, pending_mappings_table_name, petitioned_mappings_table_name ) = ClientDBMappingsStorage.GenerateMappingsTableNames(tag_service_id) current_mappings_exist = self._Execute( 'SELECT 1 FROM ' + current_mappings_table_name + ' LIMIT 1;').fetchone() is not None pending_mappings_exist = self._Execute( 'SELECT 1 FROM ' + pending_mappings_table_name + ' LIMIT 1;').fetchone() is not None if current_mappings_exist or pending_mappings_exist: # not worth iterating through all known tags for an empty service for (group_of_ids, num_done, num_to_do) in HydrusDB.ReadLargeIdQueryInSeparateChunks( self._c, 'SELECT tag_id FROM tags;', 10000): # must be a cleverer way of doing this with self._MakeTemporaryIntegerTable( group_of_ids, 'tag_id') as temp_table_name: current_counter = collections.Counter() # temp tags to mappings for (tag_id, count) in self._Execute( 'SELECT tag_id, COUNT( * ) FROM {} CROSS JOIN {} USING ( tag_id ) GROUP BY ( tag_id );' .format(temp_table_name, current_mappings_table_name)): current_counter[tag_id] = count pending_counter = collections.Counter() # temp tags to mappings for (tag_id, count) in self._Execute( 'SELECT tag_id, COUNT( * ) FROM {} CROSS JOIN {} USING ( tag_id ) GROUP BY ( tag_id );' .format(temp_table_name, pending_mappings_table_name)): pending_counter[tag_id] = count all_ids_seen = set(current_counter.keys()) all_ids_seen.update(pending_counter.keys()) counts_cache_changes = [(tag_id, current_counter[tag_id], pending_counter[tag_id]) for tag_id in all_ids_seen] if len(counts_cache_changes) > 0: self.modules_mappings_counts_update.AddCounts( ClientTags.TAG_DISPLAY_STORAGE, self.modules_services.combined_file_service_id, tag_service_id, counts_cache_changes) 
self.modules_mappings_cache_combined_files_display.Generate( tag_service_id)
def GetSubtagIdsFromWildcardIntoTable(self, file_service_id: int, tag_service_id: int, subtag_wildcard, subtag_id_table_name, job_key=None):
    """Insert all subtag_ids matching subtag_wildcard into subtag_id_table_name.

    The combined tag service fans out to every real tag service. Wildcards go
    through the FTS4 cache (or a raw LIKE scan when FTS4 cannot help); exact
    subtags go through the searchable-subtag map. A cancelled job_key empties
    the destination table and returns early.
    """
    if tag_service_id == self.modules_services.combined_tag_service_id:
        search_tag_service_ids = self.modules_services.GetServiceIds(
            HC.REAL_TAG_SERVICES)
    else:
        search_tag_service_ids = (tag_service_id, )
    for search_tag_service_id in search_tag_service_ids:
        if '*' in subtag_wildcard:
            subtags_fts4_table_name = self.GetSubtagsFTS4TableName(
                file_service_id, search_tag_service_id)
            wildcard_has_fts4_searchable_characters = WildcardHasFTS4SearchableCharacters(
                subtag_wildcard)
            if subtag_wildcard == '*':
                # hellmode, but shouldn't be called normally
                cursor = self._Execute('SELECT docid FROM {};'.format(
                    subtags_fts4_table_name))
            elif ClientSearch.IsComplexWildcard(
                    subtag_wildcard) or not wildcard_has_fts4_searchable_characters:
                # FTS4 does not support complex wildcards, so instead we'll search our raw subtags
                # however, since we want to search 'searchable' text, we use the 'searchable subtags map' to cross between real and searchable
                like_param = ConvertWildcardToSQLiteLikeParameter(
                    subtag_wildcard)
                if subtag_wildcard.startswith(
                        '*') or not wildcard_has_fts4_searchable_characters:
                    # this is a SCAN, but there we go
                    # a potential optimisation here, in future, is to store fts4 of subtags reversed, then for '*amus', we can just search that reverse cache for 'suma*'
                    # and this would only double the size of the fts4 cache, the largest cache in the whole db! a steal!
                    # it also would not fix '*amu*', but with some cleverness could speed up '*amus ar*'
                    query = 'SELECT docid FROM {} WHERE subtag LIKE ?;'.format(
                        subtags_fts4_table_name)
                    cursor = self._Execute(query, (like_param, ))
                else:
                    # we have an optimisation here--rather than searching all subtags for bl*ah, let's search all the bl* subtags for bl*ah!
                    prefix_fts4_wildcard = subtag_wildcard.split('*')[0]
                    prefix_fts4_wildcard_param = '"{}*"'.format(
                        prefix_fts4_wildcard)
                    query = 'SELECT docid FROM {} WHERE subtag MATCH ? AND subtag LIKE ?;'.format(
                        subtags_fts4_table_name)
                    cursor = self._Execute(
                        query, (prefix_fts4_wildcard_param, like_param))
            else:
                # we want the " " wrapping our search text to keep whitespace words connected and in order
                # "samus ar*" should not match "around samus"
                # simple 'sam*' style subtag, so we can search fts4 no prob
                subtags_fts4_param = '"{}"'.format(subtag_wildcard)
                cursor = self._Execute(
                    'SELECT docid FROM {} WHERE subtag MATCH ?;'.format(
                        subtags_fts4_table_name), (subtags_fts4_param, ))
            cancelled_hook = None
            if job_key is not None:
                cancelled_hook = job_key.IsCancelled
            # fts4 docid doubles as subtag_id, so the rows can be inserted directly
            loop_of_subtag_id_tuples = HydrusDB.ReadFromCancellableCursor(
                cursor, 1024, cancelled_hook=cancelled_hook)
            self._ExecuteMany(
                'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.format(
                    subtag_id_table_name), loop_of_subtag_id_tuples)
        else:
            # old notes from before we had searchable subtag map. I deleted that map once, albeit in an older and less efficient form. *don't delete it again, it has use*
            #
            # NOTE: doing a subtag = 'blah' lookup on subtags_fts4 tables is ultra slow, lmao!
            # attempts to match '/a/' to 'a' with clever FTS4 MATCHing (i.e. a MATCH on a*\b, then an '= a') proved not super successful
            # in testing, it was still a bit slow. my guess is it is still iterating through all the nodes for ^a*, the \b just makes it a bit more efficient sometimes
            # in tests '^a\b' was about twice as fast as 'a*', so the \b might not even be helping at all
            # so, I decided to move back to a lean and upgraded searchable subtag map, and here we are
            searchable_subtag = subtag_wildcard
            if self.modules_tags.SubtagExists(searchable_subtag):
                searchable_subtag_id = self.modules_tags.GetSubtagId(
                    searchable_subtag)
                self._Execute(
                    'INSERT OR IGNORE INTO {} ( subtag_id ) VALUES ( ? );'.
                    format(subtag_id_table_name), (searchable_subtag_id, ))
                # also pull every raw subtag whose searchable form is this subtag
                subtags_searchable_map_table_name = self.GetSubtagsSearchableMapTableName(
                    file_service_id, search_tag_service_id)
                self._Execute(
                    'INSERT OR IGNORE INTO {} ( subtag_id ) SELECT subtag_id FROM {} WHERE searchable_subtag_id = ?;'
                    .format(subtag_id_table_name,
                            subtags_searchable_map_table_name),
                    (searchable_subtag_id, ))
        if job_key is not None and job_key.IsCancelled():
            # leave no partial results behind on cancel
            self._Execute('DELETE FROM {};'.format(subtag_id_table_name))
            return
def _PopulateHashIdsToHashesCache( self, hash_ids, exception_on_error = False ):
    """Ensure every id in hash_ids has an entry in self._hash_ids_to_hashes_cache.

    A hash_id missing from the master hashes table either raises DataMissing
    (exception_on_error=True) or is papered over with a recognisable dummy
    hash so the client can keep going.
    """
    # cap cache growth: once it is big, keep only the entries this request needs
    if len( self._hash_ids_to_hashes_cache ) > 100000:
        if not isinstance( hash_ids, set ):
            hash_ids = set( hash_ids )
        self._hash_ids_to_hashes_cache = { hash_id : hash for ( hash_id, hash ) in self._hash_ids_to_hashes_cache.items() if hash_id in hash_ids }
    uncached_hash_ids = { hash_id for hash_id in hash_ids if hash_id not in self._hash_ids_to_hashes_cache }
    if len( uncached_hash_ids ) > 0:
        pubbed_error = False
        if len( uncached_hash_ids ) == 1:
            # single id: indexed lookup, no temp table needed
            ( uncached_hash_id, ) = uncached_hash_ids
            rows = self._c.execute( 'SELECT hash_id, hash FROM hashes WHERE hash_id = ?;', ( uncached_hash_id, ) ).fetchall()
        else:
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_hash_ids, 'hash_id' ) as temp_table_name:
                # temp hash_ids to actual hashes
                rows = self._c.execute( 'SELECT hash_id, hash FROM {} CROSS JOIN hashes USING ( hash_id );'.format( temp_table_name ) ).fetchall()
        uncached_hash_ids_to_hashes = dict( rows )
        if len( uncached_hash_ids_to_hashes ) < len( uncached_hash_ids ):
            # orphan hash_ids: master table rows are missing
            for hash_id in uncached_hash_ids:
                if hash_id not in uncached_hash_ids_to_hashes:
                    if exception_on_error:
                        raise HydrusExceptions.DataMissing( 'Did not find all entries for those hash ids!' )
                    HydrusData.DebugPrint( 'Database hash error: hash_id ' + str( hash_id ) + ' was missing!' )
                    HydrusData.PrintException( Exception( 'Missing file identifier stack trace.' ) )
                    if not pubbed_error:
                        # tell the user once per call, not once per orphan
                        HydrusData.ShowText( 'A file identifier was missing! This is a serious error that means your client database has an orphan file id! Think about contacting hydrus dev!' )
                        pubbed_error = True
                    # recognisable 'aaaa...' prefix plus random tail keeps the stand-in unique
                    hash = bytes.fromhex( 'aaaaaaaaaaaaaaaa' ) + os.urandom( 16 )
                    uncached_hash_ids_to_hashes[ hash_id ] = hash
        self._hash_ids_to_hashes_cache.update( uncached_hash_ids_to_hashes )
def ClearLocalDeleteRecord(self, hash_ids=None):
    """Clear deletion records from the local file domains, skipping files still in the trash.

    With hash_ids of None, every eligible record is cleared; otherwise only
    the given hashes are considered. Returns service_id -> rows cleared.
    """
    # we delete from everywhere, but not for files currently in the trash
    service_ids_to_nums_cleared = {}
    local_non_trash_service_ids = self.modules_services.GetServiceIds(
        (HC.COMBINED_LOCAL_FILE, HC.LOCAL_FILE_DOMAIN))
    if hash_ids is None:
        trash_current_files_table_name = GenerateFilesTableName(
            self.modules_services.trash_service_id, HC.CONTENT_STATUS_CURRENT)
        for service_id in local_non_trash_service_ids:
            deleted_files_table_name = GenerateFilesTableName(
                service_id, HC.CONTENT_STATUS_DELETED)
            self._c.execute(
                'DELETE FROM {} WHERE hash_id NOT IN ( SELECT hash_id FROM {} );'
                .format(deleted_files_table_name,
                        trash_current_files_table_name))
            service_ids_to_nums_cleared[service_id] = HydrusDB.GetRowCount(
                self._c)
        self._c.execute(
            'DELETE FROM local_file_deletion_reasons WHERE hash_id NOT IN ( SELECT hash_id FROM {} );'
            .format(trash_current_files_table_name))
    else:
        # anything still trashed keeps its record for now
        trashed_hash_ids = self.FilterCurrentHashIds(
            self.modules_services.trash_service_id, hash_ids)
        ok_to_clear_hash_ids = set(hash_ids).difference(trashed_hash_ids)
        if len(ok_to_clear_hash_ids) > 0:
            clear_params = [(hash_id, ) for hash_id in ok_to_clear_hash_ids]
            for service_id in local_non_trash_service_ids:
                deleted_files_table_name = GenerateFilesTableName(
                    service_id, HC.CONTENT_STATUS_DELETED)
                self._c.executemany(
                    'DELETE FROM {} WHERE hash_id = ?;'.format(
                        deleted_files_table_name), clear_params)
                service_ids_to_nums_cleared[service_id] = HydrusDB.GetRowCount(
                    self._c)
            self._c.executemany(
                'DELETE FROM local_file_deletion_reasons WHERE hash_id = ?;',
                clear_params)
    return service_ids_to_nums_cleared
def GetHashIdsAndNonZeroTagCounts( self, tag_display_type: int, location_context: ClientLocation.LocationContext, tag_search_context: ClientSearch.TagSearchContext, hash_ids, namespace_wildcard = None, job_key = None ):
    """Return ( hash_id, tag count ) rows for the given hashes, counting only matching tags.

    namespace_wildcard, when given, restricts which tags are counted.
    A cancelled job_key returns whatever partial results were gathered.
    """
    if namespace_wildcard == '*':
        namespace_wildcard = None
    if namespace_wildcard is None:
        namespace_ids = []
    else:
        namespace_ids = self.modules_tag_search.GetNamespaceIdsFromWildcard( namespace_wildcard )
    with self._MakeTemporaryIntegerTable( namespace_ids, 'namespace_id' ) as temp_namespace_ids_table_name:
        ( file_service_keys, file_location_is_cross_referenced ) = location_context.GetCoveringCurrentFileServiceKeys()
        mapping_and_tag_table_names = set()
        for file_service_key in file_service_keys:
            mapping_and_tag_table_names.update( self.modules_tag_search.GetMappingAndTagTables( tag_display_type, file_service_key, tag_search_context ) )
        # reason why I (JOIN each table) rather than (join the UNION) is based on previous hell with having query planner figure out a "( a UNION b UNION c ) NATURAL JOIN stuff" situation
        # although the following sometimes makes certifiable 2KB ( 6 UNION * 4-table ) queries, it actually works fast
        # OK, a new problem is mass UNION leads to terrible cancelability because the first row cannot be fetched until the first n - 1 union queries are done
        # I tried some gubbins to try to do a pseudo table-union rather than query union and do 'get files->distinct tag count for this union of tables, and fetch hash_ids first on the union', but did not have luck
        # so NOW we are just going to do it in bits of files mate. this also reduces memory use from the distinct-making UNION with large numbers of hash_ids
        results = []
        BLOCK_SIZE = max( 64, int( len( hash_ids ) ** 0.5 ) ) # go for square root for now
        for group_of_hash_ids in HydrusData.SplitIteratorIntoChunks( hash_ids, BLOCK_SIZE ):
            with self._MakeTemporaryIntegerTable( group_of_hash_ids, 'hash_id' ) as hash_ids_table_name:
                if namespace_wildcard is None:
                    # temp hashes to mappings
                    select_statements = [ 'SELECT hash_id, tag_id FROM {} CROSS JOIN {} USING ( hash_id )'.format( hash_ids_table_name, mappings_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
                else:
                    # temp hashes to mappings to tags to namespaces
                    select_statements = [ 'SELECT hash_id, tag_id FROM {} CROSS JOIN {} USING ( hash_id ) CROSS JOIN {} USING ( tag_id ) CROSS JOIN {} USING ( namespace_id )'.format( hash_ids_table_name, mappings_table_name, tags_table_name, temp_namespace_ids_table_name ) for ( mappings_table_name, tags_table_name ) in mapping_and_tag_table_names ]
                unions = '( {} )'.format( ' UNION '.join( select_statements ) )
                query = 'SELECT hash_id, COUNT( tag_id ) FROM {} GROUP BY hash_id;'.format( unions )
                cursor = self._Execute( query )
                cancelled_hook = None
                if job_key is not None:
                    cancelled_hook = job_key.IsCancelled
                loop_of_results = HydrusDB.ReadFromCancellableCursor( cursor, 64, cancelled_hook = cancelled_hook )
                if job_key is not None and job_key.IsCancelled():
                    # hand back what we have so far rather than discarding it
                    return results
                results.extend( loop_of_results )
    return results
def _RegenerateBranch(self, job_key, phash_id): job_key.SetVariable('popup_text_2', 'reviewing existing branch') # grab everything in the branch (parent_id, ) = self._c.execute( 'SELECT parent_id FROM shape_vptree WHERE phash_id = ?;', (phash_id, )).fetchone() cte_table_name = 'branch ( branch_phash_id )' initial_select = 'SELECT ?' recursive_select = 'SELECT phash_id FROM shape_vptree, branch ON parent_id = branch_phash_id' with_clause = 'WITH RECURSIVE ' + cte_table_name + ' AS ( ' + initial_select + ' UNION ALL ' + recursive_select + ')' unbalanced_nodes = self._c.execute( with_clause + ' SELECT branch_phash_id, phash FROM branch, shape_perceptual_hashes ON phash_id = branch_phash_id;', (phash_id, )).fetchall() # removal of old branch, maintenance schedule, and orphan phashes job_key.SetVariable( 'popup_text_2', HydrusData.ToHumanInt(len(unbalanced_nodes)) + ' leaves found--now clearing out old branch') unbalanced_phash_ids = {p_id for (p_id, p_h) in unbalanced_nodes} self._c.executemany('DELETE FROM shape_vptree WHERE phash_id = ?;', ((p_id, ) for p_id in unbalanced_phash_ids)) self._c.executemany( 'DELETE FROM shape_maintenance_branch_regen WHERE phash_id = ?;', ((p_id, ) for p_id in unbalanced_phash_ids)) with HydrusDB.TemporaryIntegerTable( self._c, unbalanced_phash_ids, 'phash_id') as temp_phash_ids_table_name: useful_phash_ids = self._STS( self._c.execute( 'SELECT phash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );' .format(temp_phash_ids_table_name))) orphan_phash_ids = unbalanced_phash_ids.difference(useful_phash_ids) self._c.executemany( 'DELETE FROM shape_perceptual_hashes WHERE phash_id = ?;', ((p_id, ) for p_id in orphan_phash_ids)) useful_nodes = [ row for row in unbalanced_nodes if row[0] in useful_phash_ids ] useful_population = len(useful_nodes) # now create the new branch, starting by choosing a new root and updating the parent's left/right reference to that if useful_population > 0: (new_phash_id, new_phash) = 
self._PopBestRootNode( useful_nodes) #HydrusData.RandomPop( useful_nodes ) else: new_phash_id = None if parent_id is not None: (parent_inner_id, ) = self._c.execute( 'SELECT inner_id FROM shape_vptree WHERE phash_id = ?;', (parent_id, )).fetchone() if parent_inner_id == phash_id: query = 'UPDATE shape_vptree SET inner_id = ?, inner_population = ? WHERE phash_id = ?;' else: query = 'UPDATE shape_vptree SET outer_id = ?, outer_population = ? WHERE phash_id = ?;' self._c.execute(query, (new_phash_id, useful_population, parent_id)) if useful_population > 0: self._GenerateBranch(job_key, parent_id, new_phash_id, new_phash, useful_nodes)
def Search(self, hash_id, max_hamming_distance):
    """Find files perceptually similar to the given file.

    For distance 0 this is a simple exact-phash lookup; otherwise it walks
    the VP-tree, pruning subtrees whose metric spheres cannot intersect the
    search sphere.

    :param hash_id: the file to search around
    :param max_hamming_distance: maximum 64-bit hamming distance to accept
    :return: list of ( similar_hash_id, distance ) pairs; empty when the
        tree has no root or the file has no phashes
    """

    if max_hamming_distance == 0:

        # distance 0 means 'same phash'--no tree walk needed
        similar_hash_ids = self._STL(
            self._c.execute(
                'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );',
                (hash_id, )))

        similar_hash_ids_and_distances = [
            (similar_hash_id, 0) for similar_hash_id in similar_hash_ids
        ]

    else:

        search_radius = max_hamming_distance

        # the root is the single node with no parent
        top_node_result = self._c.execute(
            'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;'
        ).fetchone()

        if top_node_result is None:

            return []

        (root_node_phash_id, ) = top_node_result

        # a file can have several phashes; we search around each of them
        search = self._STL(
            self._c.execute(
                'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;',
                (hash_id, )))

        if len(search) == 0:

            return []

        similar_phash_ids_to_distances = {}

        num_cycles = 0
        total_nodes_searched = 0

        for search_phash in search:

            # breadth-first frontier walk from the root
            next_potentials = [root_node_phash_id]

            while len(next_potentials) > 0:

                current_potentials = next_potentials
                next_potentials = []

                num_cycles += 1
                total_nodes_searched += len(current_potentials)

                for group_of_current_potentials in HydrusData.SplitListIntoChunks(
                        current_potentials, 10000):

                    # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                    # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                    # the crash was in sqlite code, again presumably on subsequent fetch
                    # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                    # anyway, we now just get the whole lot of results first and then work on the whole lot
                    '''
                    #old method
                    select_statement = 'SELECT phash_id, phash, radius, inner_id, outer_id FROM shape_perceptual_hashes NATURAL JOIN shape_vptree WHERE phash_id = ?;'
                    
                    results = list( self._ExecuteManySelectSingleParam( select_statement, group_of_current_potentials ) )
                    '''

                    with HydrusDB.TemporaryIntegerTable(
                            self._c, group_of_current_potentials,
                            'phash_id') as temp_table_name:

                        # temp phash_ids to actual phashes and tree info
                        results = self._c.execute(
                            'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'
                            .format(temp_table_name)).fetchall()

                    for (node_phash_id, node_phash, node_radius,
                         inner_phash_id, outer_phash_id) in results:

                        # first check the node itself--is it similar?

                        node_hamming_distance = HydrusData.Get64BitHammingDistance(
                            search_phash, node_phash)

                        if node_hamming_distance <= search_radius:

                            # keep the smallest distance seen across all of this file's phashes
                            if node_phash_id in similar_phash_ids_to_distances:

                                current_distance = similar_phash_ids_to_distances[
                                    node_phash_id]

                                similar_phash_ids_to_distances[
                                    node_phash_id] = min(
                                        node_hamming_distance,
                                        current_distance)

                            else:

                                similar_phash_ids_to_distances[
                                    node_phash_id] = node_hamming_distance

                        # now how about its children?

                        # node_radius is NULL for leaf nodes, so leaves end the walk here
                        if node_radius is not None:

                            # we have two spheres--node and search--their centers separated by node_hamming_distance
                            # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                            # there are four possibles:
                            # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                            # (----N---(-)-S--) intersects with both
                            # (----N-(--S-)-) intersects with both
                            # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N

                            if inner_phash_id is not None:

                                spheres_disjoint = node_hamming_distance > (
                                    node_radius + search_radius)

                                if not spheres_disjoint:  # i.e. they intersect at some point

                                    next_potentials.append(inner_phash_id)

                            if outer_phash_id is not None:

                                search_sphere_subset_of_node_sphere = (
                                    node_hamming_distance +
                                    search_radius) <= node_radius

                                if not search_sphere_subset_of_node_sphere:  # i.e. search sphere intersects with non-node sphere space at some point

                                    next_potentials.append(outer_phash_id)

        if HG.db_report_mode:

            HydrusData.ShowText(
                'Similar file search touched {} nodes over {} cycles.'.
                format(HydrusData.ToHumanInt(total_nodes_searched),
                       HydrusData.ToHumanInt(num_cycles)))

        # so, now we have phash_ids and distances. let's map that to actual files.
        # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found

        similar_phash_ids = list(similar_phash_ids_to_distances.keys())

        with HydrusDB.TemporaryIntegerTable(self._c, similar_phash_ids,
                                            'phash_id') as temp_table_name:

            # temp phashes to hash map
            similar_phash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(
                self._c.execute(
                    'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'
                    .format(temp_table_name)))

        similar_hash_ids_to_distances = {}

        for (phash_id, hash_ids) in similar_phash_ids_to_hash_ids.items():

            distance = similar_phash_ids_to_distances[phash_id]

            for hash_id in hash_ids:

                # again keep the minimum: a file may be reached via several phashes
                if hash_id not in similar_hash_ids_to_distances:

                    similar_hash_ids_to_distances[hash_id] = distance

                else:

                    current_distance = similar_hash_ids_to_distances[
                        hash_id]

                    if distance < current_distance:

                        similar_hash_ids_to_distances[hash_id] = distance

        similar_hash_ids_and_distances = list(
            similar_hash_ids_to_distances.items())

    return similar_hash_ids_and_distances
def GetHashIdsFromTagIds( self, tag_display_type: int, file_service_key: bytes, tag_search_context: ClientSearch.TagSearchContext, tag_ids: typing.Collection[ int ], hash_ids = None, hash_ids_table_name = None, job_key = None ):
    """Collect every hash_id mapped to any of the given tag_ids.

    Queries each relevant mappings table for the tag service/display type.
    When a candidate hash_ids table is supplied and the planner estimate says
    joining against it is cheaper, the mappings lookup is constrained to
    those hashes; otherwise the mappings tables are scanned by tag directly.

    :param tag_display_type: storage vs display mappings selector
    :param file_service_key: file domain to search within
    :param tag_search_context: tag service plus current/pending inclusion flags
    :param tag_ids: tag ids to match
    :param hash_ids: optional pre-filtered candidate hashes
    :param hash_ids_table_name: optional temp table holding those candidates
    :param job_key: optional job whose cancellation aborts the reads early
    :return: set of matching hash_ids
    """

    join_on_hash_table = False

    if hash_ids_table_name is not None and hash_ids is not None:

        tag_service_id = self.modules_services.GetServiceId( tag_search_context.service_key )
        file_service_id = self.modules_services.GetServiceId( file_service_key )

        estimated_count = self.modules_mappings_counts.GetAutocompleteCountEstimate( tag_display_type, tag_service_id, file_service_id, tag_ids, tag_search_context.include_current_tags, tag_search_context.include_pending_tags )

        # joining on the candidate hashes only pays off when the tag side is expected to be big
        join_on_hash_table = ClientDBMappingsStorage.DoingAFileJoinTagSearchIsFaster( len( hash_ids ), estimated_count )

    table_names = self.modules_tag_search.GetMappingTables( tag_display_type, file_service_key, tag_search_context )

    cancelled_hook = job_key.IsCancelled if job_key is not None else None

    result_hash_ids = set()

    def drain_into_results( cursor ):

        # read in cancellable batches and fold the hash_ids into the result set
        result_hash_ids.update( self._STI( HydrusDB.ReadFromCancellableCursor( cursor, 1024, cancelled_hook ) ) )

    if len( tag_ids ) == 1:

        ( tag_id, ) = tag_ids

        if join_on_hash_table:

            # temp hashes to mappings
            select_statements = [ 'SELECT hash_id FROM {} CROSS JOIN {} USING ( hash_id ) WHERE tag_id = ?'.format( hash_ids_table_name, mappings_table_name ) for mappings_table_name in table_names ]

        else:

            select_statements = [ 'SELECT hash_id FROM {} WHERE tag_id = ?;'.format( mappings_table_name ) for mappings_table_name in table_names ]

        for select_statement in select_statements:

            drain_into_results( self._Execute( select_statement, ( tag_id, ) ) )

    else:

        with self._MakeTemporaryIntegerTable( tag_ids, 'tag_id' ) as temp_tag_ids_table_name:

            if join_on_hash_table:

                # temp hashes to mappings to temp tags
                # this nested-EXISTS shape coaxes sqlite into a correlated scalar subquery
                # rather than materialising a tag_id list per hash
                select_statements = [ 'SELECT hash_id FROM {} WHERE EXISTS ( SELECT 1 FROM {} WHERE {}.hash_id = {}.hash_id AND EXISTS ( SELECT 1 FROM {} WHERE {}.tag_id = {}.tag_id ) );'.format( hash_ids_table_name, mappings_table_name, mappings_table_name, hash_ids_table_name, temp_tag_ids_table_name, mappings_table_name, temp_tag_ids_table_name ) for mappings_table_name in table_names ]

            else:

                # temp tags to mappings
                select_statements = [ 'SELECT hash_id FROM {} CROSS JOIN {} USING ( tag_id );'.format( temp_tag_ids_table_name, mappings_table_name ) for mappings_table_name in table_names ]

            for select_statement in select_statements:

                drain_into_results( self._Execute( select_statement ) )

    return result_hash_ids