Example #1
0
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    
    file_seeds = []
    
    seen_urls = set()
    
    for parse_results in all_parse_results:
        
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        parsed_urls = HydrusData.DedupeList( parsed_urls )
        
        parsed_urls = [ url for url in parsed_urls if url not in seen_urls ]
        
        seen_urls.update( parsed_urls )
        
        # note we do this recursively due to parse_results being appropriate only for these urls--don't move this out again, or tags will be messed up
        
        for url in parsed_urls:
            
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
            
            file_seed.SetReferralURL( source_url )
            
            file_seed.AddParseResults( parse_results, file_import_options )
            
            file_seeds.append( file_seed )
            
        
    
    return file_seeds
 def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
     
     if gallery_urls_seen_before is None:
         
         gallery_urls_seen_before = set()
         
     
     gallery_urls_seen_before.add( self.url )
     
     # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
     
     # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
     
     num_urls_added = 0
     num_urls_already_in_file_seed_cache = 0
     num_urls_total = 0
     result_404 = False
     added_new_gallery_pages = False
     stop_reason = ''
     
     try:
         
         gallery_url = self.url
         
         url_for_child_referral = gallery_url
         
         ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
         
         if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
             
             raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
             
         
         if not can_parse:
             
             raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason) )
             
         
         ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
         
         status_hook( 'downloading gallery page' )
         
         if self._referral_url is not None and self._referral_url != url_to_check:
             
             referral_url = self._referral_url
             
         elif gallery_url != url_to_check:
             
             referral_url = gallery_url
             
         else:
             
             referral_url = None
             
         
         network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
         
         network_job.SetGalleryToken( gallery_token_name )
         
         network_job.OverrideBandwidth( 30 )
         
         HG.client_controller.network_engine.AddJob( network_job )
         
         with network_job_presentation_context_factory( network_job ) as njpc:
             
             network_job.WaitUntilDone()
             
         
         parsing_text = network_job.GetContentText()
         
         actual_fetched_url = network_job.GetActualFetchedURL()
         
         do_parse = True
         
         if actual_fetched_url != url_to_check:
             
             ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
             
             if url_type == HC.URL_TYPE_GALLERY:
                 
                 if can_parse:
                     
                     gallery_url = actual_fetched_url
                     
                     url_for_child_referral = gallery_url
                     
                     ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                     
                 else:
                     
                     do_parse = False
                     
                     status = CC.STATUS_ERROR
                     
                     note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                     
                 
             else:
                 
                 do_parse = False
                 
                 from hydrus.client.importing import ClientImportFileSeeds
                 
                 file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                 
                 file_seed.SetReferralURL( url_for_child_referral )
                 
                 file_seeds = [ file_seed ]
                 
                 file_seeds_callable( ( file_seed, ) )
                 
                 status = CC.STATUS_SUCCESSFUL_AND_NEW
                 
                 note = 'was redirected to a non-gallery url, which has been queued as a file import'
                 
             
         
         if do_parse:
             
             parsing_context = {}
             
             parsing_context[ 'gallery_url' ] = gallery_url
             parsing_context[ 'url' ] = url_to_check
             parsing_context[ 'post_index' ] = '0'
             
             all_parse_results = parser.Parse( parsing_context, parsing_text )
             
             if len( all_parse_results ) == 0:
                 
                 raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                 
             
             file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
             
             title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
             
             if title is not None:
                 
                 title_hook( title )
                 
             
             for file_seed in file_seeds:
                 
                 file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                 file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                 
             
             num_urls_total = len( file_seeds )
             
             ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
             
             status = CC.STATUS_SUCCESSFUL_AND_NEW
             
             note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
             
             if num_urls_already_in_file_seed_cache > 0:
                 
                 note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                 
             
             if not can_search_for_more_files:
                 
                 note += ' - ' + stop_reason
                 
             
             if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                 
                 can_add_more_gallery_urls = True
                 
             else:
                 
                 # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                 can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                 
             
             flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
             
             sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
             
             sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
             
             new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
             
             num_new_sub_gallery_urls = len( new_sub_gallery_urls )
             
             if num_new_sub_gallery_urls > 0:
                 
                 sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                 
                 for sub_gallery_seed in sub_gallery_seeds:
                     
                     sub_gallery_seed.SetRunToken( self._run_token )
                     sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                     sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                     
                 
                 gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                 
                 added_new_gallery_pages = True
                 
                 gallery_urls_seen_before.update( sub_gallery_urls )
                 
                 note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                 
             
             if self._can_generate_more_pages and can_add_more_gallery_urls:
                 
                 next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                 
                 if self.url in next_page_urls:
                     
                     next_page_urls.remove( self.url )
                     
                 
                 if url_to_check in next_page_urls:
                     
                     next_page_urls.remove( url_to_check )
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_generation_phrase = ' next gallery pages found'
                     
                 else:
                     
                     # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                     
                     url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                     
                     if url_class is not None and url_class.CanGenerateNextGalleryPage():
                         
                         try:
                             
                             next_page_url = url_class.GetNextGalleryPage( url_to_check )
                             
                             next_page_urls = [ next_page_url ]
                             
                         except Exception as e:
                             
                             note += ' - Attempted to generate a next gallery page url, but failed!'
                             note += os.linesep
                             note += traceback.format_exc()
                             
                         
                     
                     next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_urls = HydrusData.DedupeList( next_page_urls )
                     
                     new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                     
                     duplicate_next_page_urls = gallery_urls_seen_before.intersection( new_next_page_urls )
                     
                     num_new_next_page_urls = len( new_next_page_urls )
                     num_dupe_next_page_urls = len( duplicate_next_page_urls )
                     
                     if num_new_next_page_urls > 0:
                         
                         next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                         
                         for next_gallery_seed in next_gallery_seeds:
                             
                             next_gallery_seed.SetRunToken( self._run_token )
                             next_gallery_seed.SetReferralURL( url_for_child_referral )
                             next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                             next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                             
                         
                         gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                         
                         added_new_gallery_pages = True
                         
                         gallery_urls_seen_before.update( new_next_page_urls )
                         
                         if num_dupe_next_page_urls == 0:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                             
                         else:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                             
                         
                     else:
                         
                         note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                         
                     
                 
             
         
         self.SetStatus( status, note = note )
         
     except HydrusExceptions.ShutdownException:
         
         pass
         
     except HydrusExceptions.VetoException as e:
         
         status = CC.STATUS_VETOED
         
         note = str( e )
         
         self.SetStatus( status, note = note )
         
         if isinstance( e, HydrusExceptions.CancelledException ):
             
             status_hook( 'cancelled!' )
             
             time.sleep( 2 )
             
         
     except HydrusExceptions.InsufficientCredentialsException:
         
         status = CC.STATUS_VETOED
         note = '403'
         
         self.SetStatus( status, note = note )
         
         status_hook( '403' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except HydrusExceptions.NotFoundException:
         
         status = CC.STATUS_VETOED
         note = '404'
         
         self.SetStatus( status, note = note )
         
         status_hook( '404' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except Exception as e:
         
         status = CC.STATUS_ERROR
         
         self.SetStatus( status, exception = e )
         
         status_hook( 'error!' )
         
         time.sleep( 3 )
         
         if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
             
             raise
             
         
     finally:
         
         gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
         
     
     return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )