def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    """
    Build one URL file seed for every distinct desired URL found in all_parse_results.
    
    all_parse_results is iterated one parse_results entry at a time. A URL that was
    already produced by an earlier entry is skipped, so each URL yields at most one seed.
    Every seed gets source_url as its referral URL and is handed the parse_results entry
    it came from, together with file_import_options.
    
    Returns the list of created file seeds, in the order their URLs were encountered.
    """
    
    collected_seeds = []
    urls_already_claimed = set()
    
    for parse_results in all_parse_results:
        
        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        fresh_urls = [ candidate for candidate in HydrusData.DedupeList( desired_urls ) if candidate not in urls_already_claimed ]
        
        urls_already_claimed.update( fresh_urls )
        
        # seeds are built inside this loop on purpose: this parse_results entry is only
        # appropriate for these urls, so hoisting this out would mess up the tags
        for fresh_url in fresh_urls:
            
            seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, fresh_url )
            
            seed.SetReferralURL( source_url )
            seed.AddParseResults( parse_results, file_import_options )
            
            collected_seeds.append( seed )
            
        
    
    return collected_seeds
def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
    """
    Fetch this gallery seed's URL, parse the response, and queue what it yields.
    
    The page is downloaded through a network job, parsed with the parser the domain
    manager selects for the URL, and the results are turned into:
    
    - file seeds, which are handed to file_seeds_callable
    - sub-gallery seeds and next-page seeds, which are added to gallery_seed_log
    
    If the fetch is redirected to a URL that is not a gallery URL, that URL is queued
    as a single file import and no parsing happens.
    
    Parameters:
    
    - gallery_token_name: passed to the network job via SetGalleryToken.
    - gallery_seed_log: receives new gallery seeds via AddGallerySeeds and is always
      notified about this seed via NotifyGallerySeedsUpdated when the method finishes.
    - file_seeds_callable: called with the file seeds. On the parsing path its return
      value is unpacked as ( num_urls_added, num_urls_already_in_file_seed_cache,
      can_search_for_more_files, stop_reason ).
    - status_hook: called with short status strings.
    - title_hook: called with the parsed title when one was found.
    - network_job_factory: called as factory( 'GET', url, referral_url = ... ).
    - network_job_presentation_context_factory: called with the network job; the
      result is used as a context manager while waiting for the job.
    - file_import_options: forwarded to ClientImporting.ConvertAllParseResultsToFileSeeds.
    - gallery_urls_seen_before: set of gallery URLs already visited; it is updated
      in place. A fresh set is used when None is given.
    
    Returns ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total,
    result_404, added_new_gallery_pages, stop_reason ).
    
    Most exceptions are recorded on this seed through SetStatus and swallowed. An
    exception reaching the generic handler that is a HydrusExceptions.NetworkException
    is re-raised after being recorded.
    """
    
    if gallery_urls_seen_before is None:
        
        gallery_urls_seen_before = set()
        
    
    gallery_urls_seen_before.add( self.url )
    
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
    
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ''
    
    try:
        
        gallery_url = self.url
        
        url_for_child_referral = gallery_url
        
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
        
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
            
        
        if not can_parse:
            
            raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason ) )
            
        
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
        
        status_hook( 'downloading gallery page' )
        
        # prefer the referral url given to this seed; otherwise, if we are fetching a different url
        # than the gallery url itself, use the gallery url as the referrer
        if self._referral_url is not None and self._referral_url != url_to_check:
            
            referral_url = self._referral_url
            
        elif gallery_url != url_to_check:
            
            referral_url = gallery_url
            
        else:
            
            referral_url = None
            
        
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        
        network_job.SetGalleryToken( gallery_token_name )
        
        network_job.OverrideBandwidth( 30 )
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        with network_job_presentation_context_factory( network_job ):
            
            network_job.WaitUntilDone()
            
        
        parsing_text = network_job.GetContentText()
        
        actual_fetched_url = network_job.GetActualFetchedURL()
        
        do_parse = True
        
        if actual_fetched_url != url_to_check:
            
            # the fetched url differs from the one we asked for, so work out what we actually got
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
            
            if url_type == HC.URL_TYPE_GALLERY:
                
                if can_parse:
                    
                    gallery_url = actual_fetched_url
                    
                    url_for_child_referral = gallery_url
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                    
                else:
                    
                    do_parse = False
                    
                    status = CC.STATUS_ERROR
                    
                    note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                    
                
            else:
                
                do_parse = False
                
                # NOTE(review): imported here rather than at file level, presumably to avoid a circular import--confirm before moving
                from hydrus.client.importing import ClientImportFileSeeds
                
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                
                file_seed.SetReferralURL( url_for_child_referral )
                
                # the callable's result is not used here, so the returned counts stay at zero for this path
                file_seeds_callable( ( file_seed, ) )
                
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                
                note = 'was redirected to a non-gallery url, which has been queued as a file import'
                
            
        
        if do_parse:
            
            parsing_context = {}
            
            parsing_context[ 'gallery_url' ] = gallery_url
            parsing_context[ 'url' ] = url_to_check
            parsing_context[ 'post_index' ] = '0'
            
            all_parse_results = parser.Parse( parsing_context, parsing_text )
            
            if len( all_parse_results ) == 0:
                
                raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                
            
            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
            
            title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
            
            if title is not None:
                
                title_hook( title )
                
            
            for file_seed in file_seeds:
                
                file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                
            
            num_urls_total = len( file_seeds )
            
            ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
            
            status = CC.STATUS_SUCCESSFUL_AND_NEW
            
            note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
            
            if num_urls_already_in_file_seed_cache > 0:
                
                note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                
            
            if not can_search_for_more_files:
                
                note += ' - ' + stop_reason
                
            
            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                
                can_add_more_gallery_urls = True
                
            else:
                
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                
            
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            
            #
            # sub-galleries
            #
            
            sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
            
            sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
            
            new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
            
            num_new_sub_gallery_urls = len( new_sub_gallery_urls )
            
            if num_new_sub_gallery_urls > 0:
                
                sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                
                for sub_gallery_seed in sub_gallery_seeds:
                    
                    sub_gallery_seed.SetRunToken( self._run_token )
                    sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                    
                
                gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                
                added_new_gallery_pages = True
                
                gallery_urls_seen_before.update( sub_gallery_urls )
                
                note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                
            
            #
            # next pages
            #
            
            if self._can_generate_more_pages and can_add_more_gallery_urls:
                
                next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                
                # a page that names itself as its own next page would loop
                if self.url in next_page_urls:
                    
                    next_page_urls.remove( self.url )
                    
                
                if url_to_check in next_page_urls:
                    
                    next_page_urls.remove( url_to_check )
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_generation_phrase = ' next gallery pages found'
                    
                else:
                    
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                    
                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                    
                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        
                        try:
                            
                            next_page_url = url_class.GetNextGalleryPage( url_to_check )
                            
                            next_page_urls = [ next_page_url ]
                            
                        except Exception:
                            
                            # best effort: record the failure in the note and carry on without a next page
                            note += ' - Attempted to generate a next gallery page url, but failed!'
                            note += os.linesep
                            note += traceback.format_exc()
                            
                        
                    
                    next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_urls = HydrusData.DedupeList( next_page_urls )
                    
                    new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                    
                    # the duplicates must be computed against every candidate url. the new urls were just filtered
                    # to exclude everything seen before, so intersecting with them would always be empty
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                    
                    num_new_next_page_urls = len( new_next_page_urls )
                    num_dupe_next_page_urls = len( duplicate_next_page_urls )
                    
                    if num_new_next_page_urls > 0:
                        
                        next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                        
                        for next_gallery_seed in next_gallery_seeds:
                            
                            next_gallery_seed.SetRunToken( self._run_token )
                            next_gallery_seed.SetReferralURL( url_for_child_referral )
                            next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                            
                        
                        gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                        
                        added_new_gallery_pages = True
                        
                        gallery_urls_seen_before.update( new_next_page_urls )
                        
                        if num_dupe_next_page_urls == 0:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                            
                        else:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                            
                        
                    else:
                        
                        note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                        
                    
                
            
        
        self.SetStatus( status, note = note )
        
    except HydrusExceptions.ShutdownException:
        
        # deliberately leave the status untouched on shutdown
        pass
        
    except HydrusExceptions.VetoException as e:
        
        status = CC.STATUS_VETOED
        
        note = str( e )
        
        self.SetStatus( status, note = note )
        
        if isinstance( e, HydrusExceptions.CancelledException ):
            
            status_hook( 'cancelled!' )
            
            time.sleep( 2 )
            
        
    except HydrusExceptions.InsufficientCredentialsException:
        
        status = CC.STATUS_VETOED
        note = '403'
        
        self.SetStatus( status, note = note )
        
        status_hook( '403' )
        
        time.sleep( 2 )
        
        # NOTE(review): a 403 is reported to the caller through the same flag as a 404--presumably intended, confirm
        result_404 = True
        
    except HydrusExceptions.NotFoundException:
        
        status = CC.STATUS_VETOED
        note = '404'
        
        self.SetStatus( status, note = note )
        
        status_hook( '404' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except Exception as e:
        
        status = CC.STATUS_ERROR
        
        self.SetStatus( status, exception = e )
        
        status_hook( 'error!' )
        
        time.sleep( 3 )
        
        if isinstance( e, HydrusExceptions.NetworkException ):
            
            # so the larger queue can set a delaywork or whatever
            raise
            
        
    finally:
        
        gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
        
    
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )