def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    
    file_seeds = []
    
    seen_urls = set()
    
    for parse_results in all_parse_results:
        
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        parsed_urls = HydrusData.DedupeList( parsed_urls )
        
        parsed_urls = [ url for url in parsed_urls if url not in seen_urls ]
        
        seen_urls.update( parsed_urls )
        
        # note we keep this inside the per-parse_results loop because each parse_results applies only to its own urls--don't hoist this out again, or tags will be messed up
        
        for url in parsed_urls:
            
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
            
            file_seed.SetReferralURL( source_url )
            
            file_seed.AddParseResults( parse_results, file_import_options )
            
            file_seeds.append( file_seed )
            
    
    return file_seeds
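
# A minimal sketch of the cross-result dedupe pattern used above, in plain Python so it
# can be reasoned about in isolation. It assumes only that HydrusData.DedupeList preserves
# first-seen order; the names below are hypothetical and not part of the hydrus codebase.

def dedupe_across_groups( groups ):
    
    seen = set()
    deduped_groups = []
    
    for group in groups:
        
        # dedupe within the group (dict preserves insertion order), then drop anything an earlier group already claimed
        fresh = [ item for item in dict.fromkeys( group ) if item not in seen ]
        
        seen.update( fresh )
        deduped_groups.append( fresh )
        
    
    return deduped_groups

# e.g. dedupe_across_groups( [ [ 'a', 'b', 'a' ], [ 'b', 'c' ] ] ) == [ [ 'a', 'b' ], [ 'c' ] ]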
def ForceLogins( self, domains_to_login: typing.Collection[ str ] ):
    
    with self._lock:
        
        self._domains_to_login.extend( domains_to_login )
        
        self._domains_to_login = HydrusData.DedupeList( self._domains_to_login )
def _ImportURLs( self, urls ):
    
    gallery_seed_log = self._gallery_seed_log_get_callable()
    
    urls = HydrusData.DedupeList( urls )
    
    filtered_urls = [ url for url in urls if not gallery_seed_log.HasGalleryURL( url ) ]
    
    urls_to_add = urls
    
    if len( filtered_urls ) < len( urls ):
        
        num_urls = len( urls )
        num_removed = num_urls - len( filtered_urls )
        
        message = 'Of the ' + HydrusData.ToHumanInt( num_urls ) + ' URLs you mean to add, ' + HydrusData.ToHumanInt( num_removed ) + ' are already in the search log. Would you like to only add new URLs or add everything (which will force a re-check of the duplicates)?'
        
        ( result, was_cancelled ) = ClientGUIDialogsQuick.GetYesNo( self, message, yes_label = 'only add new urls', no_label = 'add all urls, even duplicates', check_for_cancelled = True )
        
        if was_cancelled:
            
            return
            
        
        if result == QW.QDialog.Accepted:
            
            urls_to_add = filtered_urls
            
        elif result == QW.QDialog.Rejected:
            
            return
            
        
    
    can_generate_more_pages = False
    
    if self._can_generate_more_pages:
        
        message = 'Would you like these urls to only check for new files, or would you like them to also generate subsequent gallery pages, like a regular search would?'
        
        ( result, was_cancelled ) = ClientGUIDialogsQuick.GetYesNo( self, message, yes_label = 'just check what I am adding', no_label = 'start a potential new search for every url added', check_for_cancelled = True )
        
        if was_cancelled:
            
            return
            
        
        can_generate_more_pages = result == QW.QDialog.Rejected
        
    
    gallery_seeds = [ ClientImportGallerySeeds.GallerySeed( url, can_generate_more_pages = can_generate_more_pages ) for url in urls_to_add ]
    
    gallery_seed_log.AddGallerySeeds( gallery_seeds )
def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
    
    if gallery_urls_seen_before is None:
        
        gallery_urls_seen_before = set()
        
    
    gallery_urls_seen_before.add( self.url )
    
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
    
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ''
    
    try:
        
        gallery_url = self.url
        url_for_child_referral = gallery_url
        
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
        
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
            
        
        if not can_parse:
            
            raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason ) )
            
        
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
        
        status_hook( 'downloading gallery page' )
        
        if self._referral_url is not None and self._referral_url != url_to_check:
            
            referral_url = self._referral_url
            
        elif gallery_url != url_to_check:
            
            referral_url = gallery_url
            
        else:
            
            referral_url = None
            
        
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        
        network_job.SetGalleryToken( gallery_token_name )
        network_job.OverrideBandwidth( 30 )
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        with network_job_presentation_context_factory( network_job ) as njpc:
            
            network_job.WaitUntilDone()
            
        
        parsing_text = network_job.GetContentText()
        actual_fetched_url = network_job.GetActualFetchedURL()
        
        do_parse = True
        
        if actual_fetched_url != url_to_check:
            
            # we were redirected, so check whether the final url is something we can still work with
            
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
            
            if url_type == HC.URL_TYPE_GALLERY:
                
                if can_parse:
                    
                    gallery_url = actual_fetched_url
                    url_for_child_referral = gallery_url
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                    
                else:
                    
                    do_parse = False
                    
                    status = CC.STATUS_ERROR
                    note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                    
                
            else:
                
                do_parse = False
                
                from hydrus.client.importing import ClientImportFileSeeds
                
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                
                file_seed.SetReferralURL( url_for_child_referral )
                
                file_seeds = [ file_seed ]
                
                file_seeds_callable( ( file_seed, ) )
                
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                note = 'was redirected to a non-gallery url, which has been queued as a file import'
                
            
        
        if do_parse:
            
            parsing_context = {}
            
            parsing_context[ 'gallery_url' ] = gallery_url
            parsing_context[ 'url' ] = url_to_check
            parsing_context[ 'post_index' ] = '0'
            
            all_parse_results = parser.Parse( parsing_context, parsing_text )
            
            if len( all_parse_results ) == 0:
                
                raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                
            
            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
            
            title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
            
            if title is not None:
                
                title_hook( title )
                
            
            for file_seed in file_seeds:
                
                file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                
            
            num_urls_total = len( file_seeds )
            
            ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
            
            status = CC.STATUS_SUCCESSFUL_AND_NEW
            note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
            
            if num_urls_already_in_file_seed_cache > 0:
                
                note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                
            
            if not can_search_for_more_files:
                
                note += ' - ' + stop_reason
                
            
            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                
                can_add_more_gallery_urls = True
                
            else:
                
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                
            
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            
            sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
            
            sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
            
            new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
            
            num_new_sub_gallery_urls = len( new_sub_gallery_urls )
            
            if num_new_sub_gallery_urls > 0:
                
                sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                
                for sub_gallery_seed in sub_gallery_seeds:
                    
                    sub_gallery_seed.SetRunToken( self._run_token )
                    sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                    
                
                gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                
                added_new_gallery_pages = True
                
                gallery_urls_seen_before.update( sub_gallery_urls )
                
                note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                
            
            if self._can_generate_more_pages and can_add_more_gallery_urls:
                
                next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                
                if self.url in next_page_urls:
                    
                    next_page_urls.remove( self.url )
                    
                
                if url_to_check in next_page_urls:
                    
                    next_page_urls.remove( url_to_check )
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_generation_phrase = ' next gallery pages found'
                    
                else:
                    
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url class can provide one
                    
                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                    
                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        
                        try:
                            
                            next_page_url = url_class.GetNextGalleryPage( url_to_check )
                            
                            next_page_urls = [ next_page_url ]
                            
                        except Exception as e:
                            
                            note += ' - Attempted to generate a next gallery page url, but failed!'
                            note += os.linesep
                            note += traceback.format_exc()
                            
                        
                    
                    next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_urls = HydrusData.DedupeList( next_page_urls )
                    
                    new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                    
                    # intersect with the full list--new_next_page_urls has already had seen urls filtered out, so intersecting with it would always be empty
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                    
                    num_new_next_page_urls = len( new_next_page_urls )
                    num_dupe_next_page_urls = len( duplicate_next_page_urls )
                    
                    if num_new_next_page_urls > 0:
                        
                        next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                        
                        for next_gallery_seed in next_gallery_seeds:
                            
                            next_gallery_seed.SetRunToken( self._run_token )
                            next_gallery_seed.SetReferralURL( url_for_child_referral )
                            next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                            
                        
                        gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                        
                        added_new_gallery_pages = True
                        
                        gallery_urls_seen_before.update( new_next_page_urls )
                        
                        if num_dupe_next_page_urls == 0:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                            
                        else:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                            
                        
                    else:
                        
                        note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                        
                    
                
            
        
        self.SetStatus( status, note = note )
        
    except HydrusExceptions.ShutdownException:
        
        pass
        
    except HydrusExceptions.VetoException as e:
        
        status = CC.STATUS_VETOED
        note = str( e )
        
        self.SetStatus( status, note = note )
        
        if isinstance( e, HydrusExceptions.CancelledException ):
            
            status_hook( 'cancelled!' )
            
            time.sleep( 2 )
            
        
    except HydrusExceptions.InsufficientCredentialsException:
        
        status = CC.STATUS_VETOED
        note = '403'
        
        self.SetStatus( status, note = note )
        
        status_hook( '403' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except HydrusExceptions.NotFoundException:
        
        status = CC.STATUS_VETOED
        note = '404'
        
        self.SetStatus( status, note = note )
        
        status_hook( '404' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except Exception as e:
        
        status = CC.STATUS_ERROR
        
        self.SetStatus( status, exception = e )
        
        status_hook( 'error!' )
        
        time.sleep( 3 )
        
        if isinstance( e, HydrusExceptions.NetworkException ):
            
            # so the larger queue can set a delay on work or whatever
            
            raise
            
        
    finally:
        
        gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
        
    
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
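
# The referral-url cascade near the top of WorkOnURL is easy to get wrong, so here is the
# same decision isolated as a pure function. A sketch for illustration only; the function
# name is hypothetical and not hydrus API.

def choose_referral_url( explicit_referral_url, gallery_url, url_to_check ):
    
    # prefer an explicitly set referral url, so long as it is not the url being fetched
    if explicit_referral_url is not None and explicit_referral_url != url_to_check:
        
        return explicit_referral_url
        
    
    # otherwise, if we are fetching a transformed version of the gallery url, refer from the original
    if gallery_url != url_to_check:
        
        return gallery_url
        
    
    return None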
def Search( self, hash_id, max_hamming_distance ):
    
    similar_hash_ids_and_distances = []
    
    result = self._Execute( 'SELECT pixel_hash_id FROM pixel_hash_map WHERE hash_id = ?;', ( hash_id, ) ).fetchone()
    
    if result is not None:
        
        ( pixel_hash_id, ) = result
        
        pixel_dupe_hash_ids = self._STL( self._Execute( 'SELECT hash_id FROM pixel_hash_map WHERE pixel_hash_id = ? AND hash_id != ?;', ( pixel_hash_id, hash_id ) ) )
        
        similar_hash_ids_and_distances = [ ( pixel_dupe_hash_id, 0 ) for pixel_dupe_hash_id in pixel_dupe_hash_ids ]
        
    
    if max_hamming_distance == 0:
        
        similar_hash_ids = self._STL( self._Execute( 'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );', ( hash_id, ) ) )
        
        similar_hash_ids_and_distances.extend( [ ( similar_hash_id, 0 ) for similar_hash_id in similar_hash_ids ] )
        
    else:
        
        search_radius = max_hamming_distance
        
        top_node_result = self._Execute( 'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;' ).fetchone()
        
        if top_node_result is None:
            
            return similar_hash_ids_and_distances
            
        
        ( root_node_perceptual_hash_id, ) = top_node_result
        
        search = self._STL( self._Execute( 'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;', ( hash_id, ) ) )
        
        if len( search ) == 0:
            
            return similar_hash_ids_and_distances
            
        
        similar_perceptual_hash_ids_to_distances = {}
        
        num_cycles = 0
        total_nodes_searched = 0
        
        for search_perceptual_hash in search:
            
            next_potentials = [ root_node_perceptual_hash_id ]
            
            while len( next_potentials ) > 0:
                
                current_potentials = next_potentials
                next_potentials = []
                
                num_cycles += 1
                total_nodes_searched += len( current_potentials )
                
                for group_of_current_potentials in HydrusData.SplitListIntoChunks( current_potentials, 10000 ):
                    
                    # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                    # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                    # the crash was in sqlite code, again presumably on subsequent fetch
                    # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                    # anyway, we now just get the whole lot of results first and then work on the whole lot
                    
                    with self._MakeTemporaryIntegerTable( group_of_current_potentials, 'phash_id' ) as temp_table_name:
                        
                        # temp phash_ids to actual phashes and tree info
                        results = self._Execute( 'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'.format( temp_table_name ) ).fetchall()
                        
                    
                    for ( node_perceptual_hash_id, node_perceptual_hash, node_radius, inner_perceptual_hash_id, outer_perceptual_hash_id ) in results:
                        
                        # first check the node itself--is it similar?
                        
                        node_hamming_distance = HydrusData.Get64BitHammingDistance( search_perceptual_hash, node_perceptual_hash )
                        
                        if node_hamming_distance <= search_radius:
                            
                            if node_perceptual_hash_id in similar_perceptual_hash_ids_to_distances:
                                
                                current_distance = similar_perceptual_hash_ids_to_distances[ node_perceptual_hash_id ]
                                
                                similar_perceptual_hash_ids_to_distances[ node_perceptual_hash_id ] = min( node_hamming_distance, current_distance )
                                
                            else:
                                
                                similar_perceptual_hash_ids_to_distances[ node_perceptual_hash_id ] = node_hamming_distance
                                
                            
                        
                        # now how about its children?
                        
                        if node_radius is not None:
                            
                            # we have two spheres--node and search--their centers separated by node_hamming_distance
                            # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                            # there are four possibles:
                            # (----N----)-(--S--) intersects with outer only - distance between N and S > their radii
                            # (----N---(-)-S--) intersects with both
                            # (----N-(--S-)-) intersects with both
                            # (---(-N-S--)-) intersects with inner only - distance between N and S + radius_S does not exceed radius_N
                            
                            if inner_perceptual_hash_id is not None:
                                
                                spheres_disjoint = node_hamming_distance > ( node_radius + search_radius )
                                
                                if not spheres_disjoint: # i.e. they intersect at some point
                                    
                                    next_potentials.append( inner_perceptual_hash_id )
                                    
                                
                            
                            if outer_perceptual_hash_id is not None:
                                
                                search_sphere_subset_of_node_sphere = ( node_hamming_distance + search_radius ) <= node_radius
                                
                                if not search_sphere_subset_of_node_sphere: # i.e. search sphere intersects with non-node sphere space at some point
                                    
                                    next_potentials.append( outer_perceptual_hash_id )
                                    
                                
                            
                        
                    
                
            
        
        if HG.db_report_mode:
            
            HydrusData.ShowText( 'Similar file search touched {} nodes over {} cycles.'.format( HydrusData.ToHumanInt( total_nodes_searched ), HydrusData.ToHumanInt( num_cycles ) ) )
            
        
        # so, now we have phash_ids and distances. let's map that to actual files.
        # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found
        
        similar_perceptual_hash_ids = list( similar_perceptual_hash_ids_to_distances.keys() )
        
        with self._MakeTemporaryIntegerTable( similar_perceptual_hash_ids, 'phash_id' ) as temp_table_name:
            
            # temp phashes to hash map
            similar_perceptual_hash_ids_to_hash_ids = HydrusData.BuildKeyToListDict( self._Execute( 'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'.format( temp_table_name ) ) )
            
        
        similar_hash_ids_to_distances = {}
        
        for ( perceptual_hash_id, hash_ids ) in similar_perceptual_hash_ids_to_hash_ids.items():
            
            distance = similar_perceptual_hash_ids_to_distances[ perceptual_hash_id ]
            
            for hash_id in hash_ids:
                
                if hash_id not in similar_hash_ids_to_distances:
                    
                    similar_hash_ids_to_distances[ hash_id ] = distance
                    
                else:
                    
                    current_distance = similar_hash_ids_to_distances[ hash_id ]
                    
                    if distance < current_distance:
                        
                        similar_hash_ids_to_distances[ hash_id ] = distance
                        
                    
                
            
        
        similar_hash_ids_and_distances.extend( similar_hash_ids_to_distances.items() )
        
    
    similar_hash_ids_and_distances = HydrusData.DedupeList( similar_hash_ids_and_distances )
    
    return similar_hash_ids_and_distances
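
# The inner/outer descent rule in Search is the classic vantage-point tree bound, and it
# is worth seeing stripped of the SQL plumbing. A minimal sketch, assuming 64-bit integer
# perceptual hashes and a dict-based tree; all names here are hypothetical, not hydrus code.

def hamming_distance( a, b ):
    
    return bin( a ^ b ).count( '1' )
    

def vp_search( node, query, search_radius, results ):
    
    # node: { 'phash': int, 'radius': int or None, 'inner': node or None, 'outer': node or None }
    
    if node is None:
        
        return
        
    
    d = hamming_distance( query, node[ 'phash' ] )
    
    if d <= search_radius:
        
        results.append( ( node[ 'phash' ], d ) )
        
    
    if node[ 'radius' ] is not None:
        
        # descend inner unless the search sphere is wholly disjoint from the node sphere
        if d <= node[ 'radius' ] + search_radius:
            
            vp_search( node[ 'inner' ], query, search_radius, results )
            
        
        # descend outer unless the search sphere sits wholly inside the node sphere
        if d + search_radius > node[ 'radius' ]:
            
            vp_search( node[ 'outer' ], query, search_radius, results )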