Example #1
0
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    """
    Convert parser output into file seeds, one per newly seen 'desired' URL.
    
    Each seed gets source_url as its referral url and carries the parse_results
    it came from. URLs are deduped both within and across parse_results.
    """
    
    file_seeds = []
    
    urls_already_handled = set()
    
    for parse_results in all_parse_results:
        
        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        desired_urls = HydrusData.DedupeList( desired_urls )
        
        new_urls = [ desired_url for desired_url in desired_urls if desired_url not in urls_already_handled ]
        
        urls_already_handled.update( new_urls )
        
        # this stays inside the per-parse_results loop on purpose: each url's seed
        # must carry the parse_results it came from--don't hoist it out, or tags
        # will be mismatched
        
        for new_url in new_urls:
            
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, new_url )
            
            file_seed.SetReferralURL( source_url )
            file_seed.AddParseResults( parse_results, file_import_options )
            
            file_seeds.append( file_seed )
            
        
    
    return file_seeds
Example #2
0
    def ForceLogins(self, domains_to_login):
        """Queue the given domains for a forced login attempt, then dedupe the queue."""

        with self._lock:

            for domain in domains_to_login:

                self._domains_to_login.append(domain)

            self._domains_to_login = HydrusData.DedupeList(
                self._domains_to_login)
Example #3
0
    def ForceLogins(self, domains_to_login: typing.Collection[str]):
        """Queue the given domains for a forced login attempt.

        The queue is deduplicated afterwards. NOTE(review): the extend-then-rebind
        presumes HydrusData.DedupeList returns a fresh, order-preserving list --
        confirm there.
        """

        with self._lock:

            self._domains_to_login.extend(domains_to_login)

            self._domains_to_login = HydrusData.DedupeList(
                self._domains_to_login)
Example #4
0
 def _ImportURLs( self, urls ):
     """
     Add the given gallery URLs to this page's gallery seed log.
     
     Dedupes the input and, if some urls are already in the log, asks the user
     whether to add only the new ones or everything (forcing a re-check of the
     duplicates). If this downloader can generate more pages, also asks whether
     the new seeds may spawn subsequent gallery pages. Returns early (adding
     nothing) only if the user cancels a dialog.
     """
     
     gallery_seed_log = self._gallery_seed_log_get_callable()
     
     urls = HydrusData.DedupeList( urls )
     
     filtered_urls = [ url for url in urls if not gallery_seed_log.HasGalleryURL( url ) ]
     
     # default is to add everything, duplicates included
     urls_to_add = urls
     
     if len( filtered_urls ) < len( urls ):
         
         num_urls = len( urls )
         num_removed = num_urls - len( filtered_urls )
         
         message = 'Of the ' + HydrusData.ToHumanInt( num_urls ) + ' URLs you mean to add, ' + HydrusData.ToHumanInt( num_removed ) + ' are already in the search log. Would you like to only add new URLs or add everything (which will force a re-check of the duplicates)?'
         
         ( result, was_cancelled ) = ClientGUIDialogsQuick.GetYesNo( self, message, yes_label = 'only add new urls', no_label = 'add all urls, even duplicates', check_for_cancelled = True )
         
         if was_cancelled:
             
             return
             
         
         if result == QW.QDialog.Accepted:
             
             urls_to_add = filtered_urls
             
         
         # 'no' ( QDialog.Rejected ) falls through with urls_to_add = urls, matching the
         # 'add all urls, even duplicates' label. previously this path returned, which
         # made that choice silently add nothing.
         
     
     can_generate_more_pages = False
     
     if self._can_generate_more_pages:
         
         message = 'Would you like these urls to only check for new files, or would you like them to also generate subsequent gallery pages, like a regular search would?'
         
         ( result, was_cancelled ) = ClientGUIDialogsQuick.GetYesNo( self, message, yes_label = 'just check what I am adding', no_label = 'start a potential new search for every url added', check_for_cancelled = True )
         
         if was_cancelled:
             
             return
             
         
         # 'no' here means the user wants a full search to spin off each added url
         can_generate_more_pages = result == QW.QDialog.Rejected
         
     
     gallery_seeds = [ ClientImportGallerySeeds.GallerySeed( url, can_generate_more_pages = can_generate_more_pages ) for url in urls_to_add ]
     
     gallery_seed_log.AddGallerySeeds( gallery_seeds )
 def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
     """
     Download and parse this gallery page.
     
     File urls found are converted to file seeds and handed to file_seeds_callable;
     sub-gallery and next-page urls are queued into gallery_seed_log.
     gallery_urls_seen_before accumulates every gallery url visited this 'run' so
     chains of next/sub-gallery urls cannot loop forever.
     
     Returns ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason ).
     NetworkExceptions are re-raised so the larger queue can set a delay.
     """
     
     if gallery_urls_seen_before is None:
         
         gallery_urls_seen_before = set()
         
     
     gallery_urls_seen_before.add( self.url )
     
     # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
     
     # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
     
     num_urls_added = 0
     num_urls_already_in_file_seed_cache = 0
     num_urls_total = 0
     result_404 = False
     added_new_gallery_pages = False
     stop_reason = ''
     
     try:
         
         gallery_url = self.url
         
         url_for_child_referral = gallery_url
         
         ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
         
         if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
             
             raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
             
         
         if not can_parse:
             
             raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason) )
             
         
         ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
         
         status_hook( 'downloading gallery page' )
         
         # prefer our explicit referral url, else refer from the pretty url if it
         # differs from the actual fetch url
         
         if self._referral_url is not None and self._referral_url != url_to_check:
             
             referral_url = self._referral_url
             
         elif gallery_url != url_to_check:
             
             referral_url = gallery_url
             
         else:
             
             referral_url = None
             
         
         network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
         
         network_job.SetGalleryToken( gallery_token_name )
         
         network_job.OverrideBandwidth( 30 )
         
         HG.client_controller.network_engine.AddJob( network_job )
         
         with network_job_presentation_context_factory( network_job ) as njpc:
             
             network_job.WaitUntilDone()
             
         
         parsing_text = network_job.GetContentText()
         
         actual_fetched_url = network_job.GetActualFetchedURL()
         
         do_parse = True
         
         if actual_fetched_url != url_to_check:
             
             # we were redirected. if the destination is a parseable gallery, chase it;
             # if it is any other recognised url, queue it as a single file import
             
             ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
             
             if url_type == HC.URL_TYPE_GALLERY:
                 
                 if can_parse:
                     
                     gallery_url = actual_fetched_url
                     
                     url_for_child_referral = gallery_url
                     
                     ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                     
                 else:
                     
                     do_parse = False
                     
                     status = CC.STATUS_ERROR
                     
                     note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                     
                 
             else:
                 
                 do_parse = False
                 
                 from hydrus.client.importing import ClientImportFileSeeds
                 
                 file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                 
                 file_seed.SetReferralURL( url_for_child_referral )
                 
                 file_seeds_callable( ( file_seed, ) )
                 
                 status = CC.STATUS_SUCCESSFUL_AND_NEW
                 
                 note = 'was redirected to a non-gallery url, which has been queued as a file import'
                 
             
         
         if do_parse:
             
             parsing_context = {}
             
             parsing_context[ 'gallery_url' ] = gallery_url
             parsing_context[ 'url' ] = url_to_check
             parsing_context[ 'post_index' ] = '0'
             
             all_parse_results = parser.Parse( parsing_context, parsing_text )
             
             if len( all_parse_results ) == 0:
                 
                 raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                 
             
             file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
             
             title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
             
             if title is not None:
                 
                 title_hook( title )
                 
             
             for file_seed in file_seeds:
                 
                 file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                 file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                 
             
             num_urls_total = len( file_seeds )
             
             ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
             
             status = CC.STATUS_SUCCESSFUL_AND_NEW
             
             note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
             
             if num_urls_already_in_file_seed_cache > 0:
                 
                 note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                 
             
             if not can_search_for_more_files:
                 
                 note += ' - ' + stop_reason
                 
             
             if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                 
                 can_add_more_gallery_urls = True
                 
             else:
                 
                 # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                 can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                 
             
             flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
             
             # sub-gallery urls are followed regardless of can_add_more_gallery_urls
             
             sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
             
             sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
             
             new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
             
             num_new_sub_gallery_urls = len( new_sub_gallery_urls )
             
             if num_new_sub_gallery_urls > 0:
                 
                 sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                 
                 for sub_gallery_seed in sub_gallery_seeds:
                     
                     sub_gallery_seed.SetRunToken( self._run_token )
                     sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                     sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                     
                 
                 gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                 
                 added_new_gallery_pages = True
                 
                 gallery_urls_seen_before.update( sub_gallery_urls )
                 
                 note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                 
             
             if self._can_generate_more_pages and can_add_more_gallery_urls:
                 
                 next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                 
                 if self.url in next_page_urls:
                     
                     next_page_urls.remove( self.url )
                     
                 
                 if url_to_check in next_page_urls:
                     
                     next_page_urls.remove( url_to_check )
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_generation_phrase = ' next gallery pages found'
                     
                 else:
                     
                     # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                     
                     url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                     
                     if url_class is not None and url_class.CanGenerateNextGalleryPage():
                         
                         try:
                             
                             next_page_url = url_class.GetNextGalleryPage( url_to_check )
                             
                             next_page_urls = [ next_page_url ]
                             
                         except Exception as e:
                             
                             note += ' - Attempted to generate a next gallery page url, but failed!'
                             note += os.linesep
                             note += traceback.format_exc()
                             
                         
                     
                     next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_urls = HydrusData.DedupeList( next_page_urls )
                     
                     new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                     
                     # count how many parsed next-page urls we skipped because this run already
                     # visited them. (previously this intersected with new_next_page_urls, which
                     # by construction shares nothing with the seen set, so it was always 0)
                     duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                     
                     num_new_next_page_urls = len( new_next_page_urls )
                     num_dupe_next_page_urls = len( duplicate_next_page_urls )
                     
                     if num_new_next_page_urls > 0:
                         
                         next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                         
                         for next_gallery_seed in next_gallery_seeds:
                             
                             next_gallery_seed.SetRunToken( self._run_token )
                             next_gallery_seed.SetReferralURL( url_for_child_referral )
                             next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                             next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                             
                         
                         gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                         
                         added_new_gallery_pages = True
                         
                         gallery_urls_seen_before.update( new_next_page_urls )
                         
                         if num_dupe_next_page_urls == 0:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                             
                         else:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                             
                         
                     else:
                         
                         note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                         
                     
                 
             
         
         self.SetStatus( status, note = note )
         
     except HydrusExceptions.ShutdownException:
         
         pass
         
     except HydrusExceptions.VetoException as e:
         
         status = CC.STATUS_VETOED
         
         note = str( e )
         
         self.SetStatus( status, note = note )
         
         if isinstance( e, HydrusExceptions.CancelledException ):
             
             status_hook( 'cancelled!' )
             
             time.sleep( 2 )
             
         
     except HydrusExceptions.InsufficientCredentialsException:
         
         status = CC.STATUS_VETOED
         note = '403'
         
         self.SetStatus( status, note = note )
         
         status_hook( '403' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except HydrusExceptions.NotFoundException:
         
         status = CC.STATUS_VETOED
         note = '404'
         
         self.SetStatus( status, note = note )
         
         status_hook( '404' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except Exception as e:
         
         status = CC.STATUS_ERROR
         
         self.SetStatus( status, exception = e )
         
         status_hook( 'error!' )
         
         time.sleep( 3 )
         
         if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
             
             raise
             
         
     finally:
         
         gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
         
     
     return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
    def Search(self, hash_id, max_hamming_distance):
        """Return a list of ( hash_id, distance ) pairs for files similar to hash_id.

        Pixel-perfect duplicates are always included at distance 0. Perceptual-hash
        matches come from an exact phash self-join when max_hamming_distance == 0,
        otherwise from a VP-tree ( shape_vptree ) search out to that radius.
        """

        similar_hash_ids_and_distances = []

        # pixel dupes first: any other file sharing this file's pixel hash is distance 0

        result = self._Execute(
            'SELECT pixel_hash_id FROM pixel_hash_map WHERE hash_id = ?;',
            (hash_id, )).fetchone()

        if result is not None:

            (pixel_hash_id, ) = result

            pixel_dupe_hash_ids = self._STL(
                self._Execute(
                    'SELECT hash_id FROM pixel_hash_map WHERE pixel_hash_id = ? AND hash_id != ?;',
                    (pixel_hash_id, hash_id)))

            similar_hash_ids_and_distances = [
                (pixel_dupe_hash_id, 0)
                for pixel_dupe_hash_id in pixel_dupe_hash_ids
            ]

        if max_hamming_distance == 0:

            # zero search radius: an exact phash match, so a simple self-join will do

            similar_hash_ids = self._STL(
                self._Execute(
                    'SELECT hash_id FROM shape_perceptual_hash_map WHERE phash_id IN ( SELECT phash_id FROM shape_perceptual_hash_map WHERE hash_id = ? );',
                    (hash_id, )))

            similar_hash_ids_and_distances.extend([
                (similar_hash_id, 0) for similar_hash_id in similar_hash_ids
            ])

        else:

            search_radius = max_hamming_distance

            top_node_result = self._Execute(
                'SELECT phash_id FROM shape_vptree WHERE parent_id IS NULL;'
            ).fetchone()

            if top_node_result is None:

                # no tree built yet--only pixel dupes (if any) to report

                return similar_hash_ids_and_distances

            (root_node_perceptual_hash_id, ) = top_node_result

            search = self._STL(
                self._Execute(
                    'SELECT phash FROM shape_perceptual_hashes NATURAL JOIN shape_perceptual_hash_map WHERE hash_id = ?;',
                    (hash_id, )))

            if len(search) == 0:

                # this file has no perceptual hashes, so nothing more to find

                return similar_hash_ids_and_distances

            similar_perceptual_hash_ids_to_distances = {}

            num_cycles = 0
            total_nodes_searched = 0

            # breadth-first walk of the vp-tree, once per phash this file has

            for search_perceptual_hash in search:

                next_potentials = [root_node_perceptual_hash_id]

                while len(next_potentials) > 0:

                    current_potentials = next_potentials
                    next_potentials = []

                    num_cycles += 1
                    total_nodes_searched += len(current_potentials)

                    for group_of_current_potentials in HydrusData.SplitListIntoChunks(
                            current_potentials, 10000):

                        # this is split into fixed lists of results of subgroups because as an iterable it was causing crashes on linux!!
                        # after investigation, it seemed to be SQLite having a problem with part of Get64BitHammingDistance touching phashes it presumably was still hanging on to
                        # the crash was in sqlite code, again presumably on subsequent fetch
                        # adding a delay in seemed to fix it as well. guess it was some memory maintenance buffer/bytes thing
                        # anyway, we now just get the whole lot of results first and then work on the whole lot

                        with self._MakeTemporaryIntegerTable(
                                group_of_current_potentials,
                                'phash_id') as temp_table_name:

                            # temp phash_ids to actual phashes and tree info
                            results = self._Execute(
                                'SELECT phash_id, phash, radius, inner_id, outer_id FROM {} CROSS JOIN shape_perceptual_hashes USING ( phash_id ) CROSS JOIN shape_vptree USING ( phash_id );'
                                .format(temp_table_name)).fetchall()

                        for (node_perceptual_hash_id, node_perceptual_hash,
                             node_radius, inner_perceptual_hash_id,
                             outer_perceptual_hash_id) in results:

                            # first check the node itself--is it similar?

                            node_hamming_distance = HydrusData.Get64BitHammingDistance(
                                search_perceptual_hash, node_perceptual_hash)

                            if node_hamming_distance <= search_radius:

                                if node_perceptual_hash_id in similar_perceptual_hash_ids_to_distances:

                                    current_distance = similar_perceptual_hash_ids_to_distances[
                                        node_perceptual_hash_id]

                                    similar_perceptual_hash_ids_to_distances[
                                        node_perceptual_hash_id] = min(
                                            node_hamming_distance,
                                            current_distance)

                                else:

                                    similar_perceptual_hash_ids_to_distances[
                                        node_perceptual_hash_id] = node_hamming_distance

                            # now how about its children?

                            if node_radius is not None:

                                # we have two spheres--node and search--their centers separated by node_hamming_distance
                                # we want to search inside/outside the node_sphere if the search_sphere intersects with those spaces
                                # there are four possibles:
                                # (----N----)-(--S--)    intersects with outer only - distance between N and S > their radii
                                # (----N---(-)-S--)      intersects with both
                                # (----N-(--S-)-)        intersects with both
                                # (---(-N-S--)-)         intersects with inner only - distance between N and S + radius_S does not exceed radius_N

                                if inner_perceptual_hash_id is not None:

                                    spheres_disjoint = node_hamming_distance > (
                                        node_radius + search_radius)

                                    if not spheres_disjoint:  # i.e. they intersect at some point

                                        next_potentials.append(
                                            inner_perceptual_hash_id)

                                if outer_perceptual_hash_id is not None:

                                    search_sphere_subset_of_node_sphere = (
                                        node_hamming_distance +
                                        search_radius) <= node_radius

                                    if not search_sphere_subset_of_node_sphere:  # i.e. search sphere intersects with non-node sphere space at some point

                                        next_potentials.append(
                                            outer_perceptual_hash_id)

            if HG.db_report_mode:

                HydrusData.ShowText(
                    'Similar file search touched {} nodes over {} cycles.'.
                    format(HydrusData.ToHumanInt(total_nodes_searched),
                           HydrusData.ToHumanInt(num_cycles)))

            # so, now we have phash_ids and distances. let's map that to actual files.
            # files can have multiple phashes, and phashes can refer to multiple files, so let's make sure we are setting the smallest distance we found

            similar_perceptual_hash_ids = list(
                similar_perceptual_hash_ids_to_distances.keys())

            with self._MakeTemporaryIntegerTable(
                    similar_perceptual_hash_ids,
                    'phash_id') as temp_table_name:

                # temp phashes to hash map
                similar_perceptual_hash_ids_to_hash_ids = HydrusData.BuildKeyToListDict(
                    self._Execute(
                        'SELECT phash_id, hash_id FROM {} CROSS JOIN shape_perceptual_hash_map USING ( phash_id );'
                        .format(temp_table_name)))

            similar_hash_ids_to_distances = {}

            for (perceptual_hash_id,
                 hash_ids) in similar_perceptual_hash_ids_to_hash_ids.items():

                distance = similar_perceptual_hash_ids_to_distances[
                    perceptual_hash_id]

                for hash_id in hash_ids:

                    if hash_id not in similar_hash_ids_to_distances:

                        similar_hash_ids_to_distances[hash_id] = distance

                    else:

                        current_distance = similar_hash_ids_to_distances[
                            hash_id]

                        if distance < current_distance:

                            similar_hash_ids_to_distances[hash_id] = distance

            similar_hash_ids_and_distances.extend(
                similar_hash_ids_to_distances.items())

        # a file can be both a pixel dupe and a phash match, so dedupe the final pairs

        similar_hash_ids_and_distances = HydrusData.DedupeList(
            similar_hash_ids_and_distances)

        return similar_hash_ids_and_distances