def recall_api(request: HttpRequest, pk: int) -> HttpResponse:
    """Recall provider API, if possible."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to recall the API.")
    try:
        archive = Archive.objects.get(pk=pk)
    except Archive.DoesNotExist:
        raise Http404("Archive does not exist")
    if not archive.gallery_id:
        return render_error(request, "No gallery associated with this archive.")
    gallery = Gallery.objects.get(pk=archive.gallery_id)
    current_settings = Settings(load_from_config=crawler_settings.config)
    if current_settings.workers.web_queue:
        current_settings.set_update_metadata_options(providers=(gallery.provider,))
        current_settings.workers.web_queue.enqueue_args_list(
            (gallery.get_link(),), override_options=current_settings)
        frontend_logger.info(
            'Updating gallery API data for gallery: {} and related archives'.format(
                gallery.get_absolute_url()))
    return HttpResponseRedirect(request.META["HTTP_REFERER"])
def test_nhentai_parser(self):
    """Test Nhentai gallery page parser"""
    settings = Settings(load_from_disk=True)

    gallery_link = 'https://nhentai.net/g/198482/'

    parser = NhentaiParser(settings)
    data = parser.fetch_gallery_data(gallery_link)
    expected_data = GalleryData(
        'nh-198482', 'nhentai',
        title='(C90) [MeltdoWN COmet (Yukiu Con)] C90 Omakebon! (Pokémon GO) [English] [ATF]',
        title_jpn='(C90) [MeltdoWN COmet (雪雨こん)] C90 おまけ本! (ポケモンGO) [英訳]',
        filecount=9,
        link='https://nhentai.net/g/198482/',
        posted=dateutil.parser.parse('2017-06-19T10:33:19.022360+00:00'),
        category='Doujinshi',
        tags=[
            'parody:pokemon', 'lolicon', 'sole_female', 'sole_male',
            'b*****b', 'artist:yukiu_con', 'group:meltdown_comet',
            'language:translated', 'language:english',
        ])

    self.assertEqual(data, expected_data)
def job(self) -> None:
    while not self.stop.is_set():
        seconds_to_wait = self.wait_until_next_run()
        if self.stop.wait(timeout=seconds_to_wait):
            return
        if self.settings.autochecker.enable:
            connection.close()
            self.crawler_logger.info("Starting timed auto search")
            current_settings = Settings(load_from_config=self.settings.config)
            current_settings.silent_processing = True
            current_settings.replace_metadata = True
            self.web_queue.enqueue_args_list(
                ['-feed', '-wanted'], override_options=current_settings)
        self.update_last_run(django_tz.now())
def recall_api(request: HttpRequest, pk: int) -> HttpResponse:
    """Recall provider API, if possible."""
    if not request.user.has_perm('viewer.update_metadata'):
        return render_error(request, "You don't have the permission to refresh source metadata on an Archive.")
    try:
        archive = Archive.objects.get(pk=pk)
    except Archive.DoesNotExist:
        raise Http404("Archive does not exist")
    if not archive.gallery_id:
        return render_error(request, "No gallery associated with this archive.")
    gallery = Gallery.objects.get(pk=archive.gallery_id)

    current_settings = Settings(load_from_config=crawler_settings.config)

    if current_settings.workers.web_queue and gallery.provider:
        current_settings.set_update_metadata_options(providers=(gallery.provider,))

        def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
            event_log(
                request.user,
                'UPDATE_METADATA',
                content_object=x,
                result=result,
                data=crawled_url
            )

        current_settings.workers.web_queue.enqueue_args_list(
            (gallery.get_link(),),
            override_options=current_settings,
            gallery_callback=gallery_callback
        )

        logger.info(
            'Updating gallery API data for gallery: {} and related archives'.format(
                gallery.get_absolute_url()
            )
        )
    return HttpResponseRedirect(request.META["HTTP_REFERER"])
def job(self) -> None:
    while not self.stop.is_set():
        seconds_to_wait = self.wait_until_next_run()
        if self.stop.wait(timeout=seconds_to_wait):
            return
        if self.settings.autoupdater.enable:
            current_settings = Settings(load_from_config=self.settings.config)
            current_settings.keep_dl_type = True
            current_settings.silent_processing = True
            current_settings.config['allowed']['replace_metadata'] = 'yes'
            connection.close()

            start_date = django_tz.now() - timedelta(seconds=int(self.timer)) - timedelta(days=self.settings.autoupdater.buffer_back)
            end_date = django_tz.now() - timedelta(days=self.settings.autoupdater.buffer_after)

            to_update_providers = current_settings.autoupdater.providers

            galleries = Gallery.objects.eligible_for_use(
                posted__gte=start_date,
                posted__lte=end_date,
                provider__in=to_update_providers
            )

            if not galleries:
                logger.info(
                    "No galleries posted from {} to {} need updating. Providers: {}".format(
                        start_date, end_date, ", ".join(to_update_providers)
                    )
                )
            else:
                # Leave only info downloaders, then leave only enabled auto updated providers
                downloaders = current_settings.provider_context.get_downloaders_name_priority(
                    current_settings, filter_name='info')
                downloaders_names = [x[0] for x in downloaders if x[0].replace("_info", "") in to_update_providers]

                current_settings.allow_downloaders_only(downloaders_names, True, True, True)

                url_list = [x.get_link() for x in galleries]

                logger.info(
                    "Starting timed auto updater, updating {} galleries "
                    "posted from {} to {}. Providers: {}".format(
                        len(url_list), start_date, end_date, ", ".join(to_update_providers)
                    )
                )

                url_list.append('--update-mode')

                self.web_queue.enqueue_args_list(url_list, override_options=current_settings)
        self.update_last_run(django_tz.now())
def __init__(self, bus: Bus, settings_module: str = 'settings',
             wsgi_http_logger: type = HTTPLogger, local_settings: Settings = None) -> None:
    """
    CherryPy engine plugin to configure and mount
    the Django application onto the CherryPy server.
    """
    plugins.SimplePlugin.__init__(self, bus)
    self.settings_module = settings_module
    self.wsgi_http_logger = wsgi_http_logger
    if local_settings:
        self.crawler_settings = local_settings
    else:
        self.crawler_settings = Settings(load_from_disk=True)
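# Hypothetical wiring sketch, not taken from the project: how a CherryPy engine plugin with
# the constructor above could be registered with a preloaded Settings instance. The class
# name DjangoAppPlugin is an assumption; cherrypy.engine, subscribe(), start() and block()
# are standard CherryPy APIs.
import cherrypy
from core.base.setup import Settings

local_settings = Settings(load_from_disk=True)
DjangoAppPlugin(cherrypy.engine, settings_module='settings', local_settings=local_settings).subscribe()
cherrypy.engine.start()
cherrypy.engine.block()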
def test_nexus_parser(self):
    """Test Nexus gallery page parser"""
    settings = Settings(load_from_disk=True)

    gallery_link = 'https://hentainexus.com/view/5665'

    parser = NexusParser(settings)
    data = parser.fetch_gallery_data(gallery_link)
    expected_data = GalleryData(
        '5665', 'nexus',
        link=gallery_link,
        archiver_key='https://hentainexus.com/zip/5665',
        title='Sase-san is Very Popular',
        thumbnail_url='https://static.hentainexus.com/content/5665/cover.jpg',
        filecount=16,
        filesize=0,
        expunged=False,
        posted=None,
        category='Manga',
        tags=[
            'artist:wantan_meo', 'language:english',
            'magazine:comic_kairakuten_2019-04', 'parody:original_work',
            'publisher:fakku', 'creampie', 'fangs', 'hairy', 'hentai',
            'office_lady', 'oppai', 'uncensored', 'vanilla',
        ],
        comment='Let\'s chug \'em down! ♪',
    )

    self.assertEqual(data, expected_data)
""" # Build paths inside the project like this: os.path.join(BASE_DIR, ...) import os from typing import Any, Optional from core.base.setup import Settings from core.base.utilities import module_exists if 'PANDA_BASE_DIR' in os.environ: BASE_DIR = os.environ['PANDA_BASE_DIR'] else: BASE_DIR = os.path.dirname(os.path.dirname(__file__)) if 'PANDA_CONFIG_DIR' in os.environ: crawler_settings = Settings(load_from_disk=True, default_dir=os.environ['PANDA_CONFIG_DIR']) else: crawler_settings = Settings(load_from_disk=True) MAIN_LOGGER = crawler_settings.log_location if not os.path.exists(os.path.dirname(MAIN_LOGGER)): os.makedirs(os.path.dirname(MAIN_LOGGER)) # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = crawler_settings.django_secret_key # SECURITY WARNING: don't run with debug turned on in production! DEBUG = crawler_settings.django_debug_mode # Might want to limit it here.
def missing_archives_for_galleries(request: HttpRequest) -> HttpResponse:
    p = request.POST
    get = request.GET

    title = get.get("title", '')
    tags = get.get("tags", '')

    try:
        page = int(get.get("page", '1'))
    except ValueError:
        page = 1

    if 'clear' in get:
        form = GallerySearchForm()
    else:
        form = GallerySearchForm(initial={'title': title, 'tags': tags})

    if p and request.user.is_staff:
        pks = []
        for k, v in p.items():
            if k.startswith("sel-"):
                # k, pk = k.split('-')
                # results[pk][k] = v
                pks.append(v)
        results = Gallery.objects.filter(id__in=pks).order_by('-create_date')

        if 'delete_galleries' in p:
            for gallery in results:
                message = 'Removing gallery: {}, link: {}'.format(
                    gallery.title, gallery.get_link())
                frontend_logger.info(message)
                messages.success(request, message)
                gallery.mark_as_deleted()
        elif 'download_galleries' in p:
            for gallery in results:
                message = 'Queueing gallery: {}, link: {}'.format(
                    gallery.title, gallery.get_link())
                frontend_logger.info(message)
                messages.success(request, message)
                # Force replace_metadata when queueing from this list, since it's mostly used to download non used.
                current_settings = Settings(load_from_config=crawler_settings.config)
                if current_settings.workers.web_queue:
                    current_settings.replace_metadata = True
                    current_settings.retry_failed = True
                    if 'reason' in p and p['reason'] != '':
                        reason = p['reason']
                        # Force limit string length (reason field max_length)
                        current_settings.archive_reason = reason[:200]
                        current_settings.archive_details = gallery.reason
                        current_settings.gallery_reason = reason[:200]
                    elif gallery.reason:
                        current_settings.archive_reason = gallery.reason
                    current_settings.workers.web_queue.enqueue_args_list(
                        (gallery.get_link(),), override_options=current_settings)
        elif 'recall_api' in p:
            message = 'Recalling API for {} galleries'.format(results.count())
            frontend_logger.info(message)
            messages.success(request, message)
            gallery_links = [x.get_link() for x in results]
            gallery_providers = list(results.values_list('provider', flat=True).distinct())
            current_settings = Settings(load_from_config=crawler_settings.config)
            if current_settings.workers.web_queue:
                current_settings.set_update_metadata_options(providers=gallery_providers)
                current_settings.workers.web_queue.enqueue_args_list(
                    gallery_links, override_options=current_settings)

    if 'force_public' in request.GET:
        force_public = True
    else:
        force_public = False

    if request.user.is_staff and not force_public:
        providers = Gallery.objects.all().values_list('provider', flat=True).distinct()

        params = {}
        for k, v in get.items():
            params[k] = v
        for k in gallery_filter_keys:
            if k not in params:
                params[k] = ''

        results = filter_galleries_simple(params)
        results = results.non_used_galleries().prefetch_related('foundgallery_set')

        paginator = Paginator(results, 50)
        try:
            results = paginator.page(page)
        except (InvalidPage, EmptyPage):
            results = paginator.page(paginator.num_pages)

        d = {
            'results': results,
            'providers': providers,
            'force_public': force_public,
            'form': form
        }
    else:
        params = {}
        for k, v in get.items():
            params[k] = v
        for k in gallery_filter_keys:
            if k not in params:
                params[k] = ''

        results = filter_galleries_simple(params)
        results = results.non_used_galleries(public=True, provider__in=['panda', 'fakku'])

        d = {'results': results}
    return render(request, "viewer/archives_missing_for_galleries.html", d)
def user_crawler(request: HttpRequest) -> HttpResponse:
    """Crawl given URLs."""
    d = {}

    p = request.POST

    all_downloaders = crawler_settings.provider_context.get_downloaders_name_priority(
        crawler_settings, filter_name='generic_')

    # providers_not_generic = list(set([x[0].provider for x in all_downloaders if not x[0].provider.is_generic()]))
    generic_downloaders = [x[0] for x in all_downloaders]

    user_reason = p.get('reason', '')

    if p:
        current_settings = Settings(load_from_config=crawler_settings.config)
        if not current_settings.workers.web_queue:
            messages.error(request, 'Cannot submit links currently. Please contact an admin.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])
        url_set = set()
        # create dictionary of properties for each archive
        current_settings.replace_metadata = False
        current_settings.config['allowed']['replace_metadata'] = 'no'
        for k, v in p.items():
            if k == "downloader":
                if v == 'no-generic':
                    continue
                elif v in generic_downloaders:
                    current_settings.enable_downloader_only(v)
            elif k == "urls":
                url_list = v.split("\n")
                for item in url_list:
                    url_set.add(item.rstrip('\r'))
        urls = list(url_set)
        if not urls:
            messages.error(request, 'Submission is empty.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])

        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (reason field max_length)
            current_settings.archive_source = source[:50]

        current_settings.archive_user = request.user

        parsers = crawler_settings.provider_context.get_parsers_classes()

        def archive_callback(x: Optional['Archive'], crawled_url: Optional[str], result: str) -> None:
            event_log(
                request.user,
                'ADD_ARCHIVE',
                reason=user_reason,
                content_object=x,
                result=result,
                data=crawled_url
            )

        def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
            event_log(
                request.user,
                'ADD_GALLERY',
                reason=user_reason,
                content_object=x,
                result=result,
                data=crawled_url
            )

        current_settings.workers.web_queue.enqueue_args_list(
            urls,
            override_options=current_settings,
            archive_callback=archive_callback,
            gallery_callback=gallery_callback,
            use_argparser=False
        )

        messages.success(
            request,
            'Starting Crawler, if the links were correctly added, they should appear on the archive or gallery list.'
        )
        for url in urls:
            frontend_logger.info("User {}: queued link: {}".format(request.user.username, url))
            # event_log(
            #     request.user,
            #     'CRAWL_URL',
            #     reason=user_reason,
            #     data=url,
            #     result='queue'
            # )

        found_valid_urls: List[str] = []

        for parser in parsers:
            if parser.id_from_url_implemented():
                urls_filtered = parser.filter_accepted_urls(urls)
                found_valid_urls.extend(urls_filtered)
                for url_filtered in urls_filtered:
                    gid = parser.id_from_url(url_filtered)
                    gallery = Gallery.objects.filter(gid=gid).first()
                    if not gallery:
                        messages.success(
                            request,
                            '{}: New URL, will be added to the submit queue'.format(url_filtered)
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='queued'
                        )
                        continue
                    if gallery.is_submitted():
                        messages.info(
                            request,
                            '{}: Already in submit queue, link: {}, reason: {}'.format(
                                url_filtered, gallery.get_absolute_url(), gallery.reason)
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_submitted'
                        )
                    elif gallery.public:
                        messages.info(
                            request,
                            '{}: Already present, is public: {}'.format(
                                url_filtered,
                                request.build_absolute_uri(gallery.get_absolute_url()))
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_public'
                        )
                    else:
                        messages.info(
                            request,
                            '{}: Already present, is not public: {}'.format(
                                url_filtered,
                                request.build_absolute_uri(gallery.get_absolute_url()))
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_private'
                        )

        extra_urls = [x for x in urls if x not in found_valid_urls]

        for extra_url in extra_urls:
            messages.info(request, '{}: Extra non-provider URLs'.format(extra_url))
            event_log(
                request.user,
                'CRAWL_URL',
                reason=user_reason,
                data=extra_url,
                result='queued'
            )
        # Not really optimal when there's many commands being queued
        # for command in url_list:
        #     messages.success(request, command)
        return HttpResponseRedirect(request.META["HTTP_REFERER"])

    d.update({'downloaders': generic_downloaders})

    return render(request, "viewer/collaborators/gallery_crawler.html", d)
def submit_queue(request: HttpRequest) -> HttpResponse:
    p = request.POST
    get = request.GET

    title = get.get("title", '')
    tags = get.get("tags", '')

    user_reason = p.get('reason', '')

    try:
        page = int(get.get("page", '1'))
    except ValueError:
        page = 1

    if 'clear' in get:
        form = GallerySearchForm()
    else:
        form = GallerySearchForm(initial={'title': title, 'tags': tags})

    if p:
        pks = []
        for k, v in p.items():
            if k.startswith("sel-"):
                # k, pk = k.split('-')
                # results[pk][k] = v
                pks.append(v)

        preserved = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(pks)])

        if 'denied' in get:
            results = Gallery.objects.submitted_galleries(id__in=pks).order_by(preserved)
        else:
            results = Gallery.objects.submitted_galleries(
                ~Q(status=Gallery.DENIED), id__in=pks).order_by(preserved)

        if 'deny_galleries' in p:
            for gallery in results:
                message = 'Denying gallery: {}, link: {}, source link: {}'.format(
                    gallery.title, gallery.get_absolute_url(), gallery.get_link())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                gallery.mark_as_denied()
                event_log(
                    request.user,
                    'DENY_GALLERY',
                    reason=user_reason,
                    content_object=gallery,
                    result='denied'
                )
        elif 'download_galleries' in p:
            for gallery in results:
                message = 'Queueing gallery: {}, link: {}, source link: {}'.format(
                    gallery.title, gallery.get_absolute_url(), gallery.get_link())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                event_log(
                    request.user,
                    'ACCEPT_GALLERY',
                    reason=user_reason,
                    content_object=gallery,
                    result='accepted'
                )
                # Force replace_metadata when queueing from this list, since it's mostly used to download non used.
                current_settings = Settings(load_from_config=crawler_settings.config)
                if current_settings.workers.web_queue:
                    current_settings.replace_metadata = True
                    current_settings.retry_failed = True
                    if 'reason' in p and p['reason'] != '':
                        reason = p['reason']
                        # Force limit string length (reason field max_length)
                        current_settings.archive_reason = reason[:200]
                        current_settings.archive_details = gallery.reason
                        current_settings.gallery_reason = reason[:200]
                    elif gallery.reason:
                        current_settings.archive_reason = gallery.reason

                    def archive_callback(x: Optional['Archive'], crawled_url: Optional[str], result: str) -> None:
                        event_log(
                            request.user,
                            'ADD_ARCHIVE',
                            reason=user_reason,
                            content_object=x,
                            result=result,
                            data=crawled_url
                        )

                    def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
                        event_log(
                            request.user,
                            'ADD_GALLERY',
                            reason=user_reason,
                            content_object=x,
                            result=result,
                            data=crawled_url
                        )

                    current_settings.workers.web_queue.enqueue_args_list(
                        (gallery.get_link(),),
                        override_options=current_settings,
                        archive_callback=archive_callback,
                        gallery_callback=gallery_callback,
                    )

    providers = Gallery.objects.all().values_list('provider', flat=True).distinct()

    params = {}

    for k, v in get.items():
        params[k] = v

    for k in gallery_filter_keys:
        if k not in params:
            params[k] = ''

    results = filter_galleries_simple(params)

    if 'denied' in get:
        results = results.submitted_galleries().prefetch_related('foundgallery_set')
    else:
        results = results.submitted_galleries(~Q(status=Gallery.DENIED)).prefetch_related('foundgallery_set')

    paginator = Paginator(results, 50)
    try:
        results = paginator.page(page)
    except (InvalidPage, EmptyPage):
        results = paginator.page(paginator.num_pages)

    d = {'results': results, 'providers': providers, 'form': form}

    return render(request, "viewer/collaborators/submit_queue.html", d)
def manage_archives(request: HttpRequest) -> HttpResponse:
    p = request.POST
    get = request.GET

    title = get.get("title", '')
    tags = get.get("tags", '')

    user_reason = p.get('reason', '')

    try:
        page = int(get.get("page", '1'))
    except ValueError:
        page = 1

    if 'clear' in get:
        form = ArchiveSearchForm()
    else:
        form = ArchiveSearchForm(initial={'title': title, 'tags': tags})

    if p:
        pks = []
        for k, v in p.items():
            if k.startswith("sel-"):
                # k, pk = k.split('-')
                # results[pk][k] = v
                pks.append(v)

        preserved = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(pks)])

        archives = Archive.objects.filter(id__in=pks).order_by(preserved)
        if 'publish_archives' in p and request.user.has_perm('viewer.publish_archive'):
            for archive in archives:
                message = 'Publishing archive: {}, link: {}'.format(
                    archive.title, archive.get_absolute_url())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                archive.set_public(reason=user_reason)
                event_log(
                    request.user,
                    'PUBLISH_ARCHIVE',
                    reason=user_reason,
                    content_object=archive,
                    result='published'
                )
        elif 'unpublish_archives' in p and request.user.has_perm('viewer.publish_archive'):
            for archive in archives:
                message = 'Unpublishing archive: {}, link: {}'.format(
                    archive.title, archive.get_absolute_url())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                archive.set_private(reason=user_reason)
                event_log(
                    request.user,
                    'UNPUBLISH_ARCHIVE',
                    reason=user_reason,
                    content_object=archive,
                    result='unpublished'
                )
        elif 'delete_archives' in p and request.user.has_perm('viewer.delete_archive'):
            for archive in archives:
                message = 'Deleting archive: {}, link: {}, with its file: {} and associated gallery: {}'.format(
                    archive.title, archive.get_absolute_url(),
                    archive.zipped.path, archive.gallery)
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                gallery = archive.gallery
                archive.gallery.mark_as_deleted()
                archive.gallery = None
                archive.delete_all_files()
                archive.delete()
                event_log(
                    request.user,
                    'DELETE_ARCHIVE',
                    content_object=gallery,
                    reason=user_reason,
                    result='deleted'
                )
        elif 'update_metadata' in p and request.user.has_perm('viewer.update_metadata'):
            for archive in archives:
                gallery = archive.gallery
                message = 'Updating gallery API data for gallery: {} and related archives'.format(
                    gallery.get_absolute_url())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)

                current_settings = Settings(load_from_config=crawler_settings.config)

                if current_settings.workers.web_queue:
                    current_settings.set_update_metadata_options(providers=(gallery.provider,))

                    def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
                        event_log(
                            request.user,
                            'UPDATE_METADATA',
                            reason=user_reason,
                            content_object=x,
                            result=result,
                            data=crawled_url
                        )

                    current_settings.workers.web_queue.enqueue_args_list(
                        (gallery.get_link(),),
                        override_options=current_settings,
                        gallery_callback=gallery_callback)

                    frontend_logger.info(
                        'Updating gallery API data for gallery: {} and related archives'.format(
                            gallery.get_absolute_url()))
        elif 'add_to_group' in p and request.user.has_perm('viewer.change_archivegroup'):
            if 'archive_group' in p:
                archive_group_ids = p.getlist('archive_group')

                preserved = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(archive_group_ids)])

                archive_groups = ArchiveGroup.objects.filter(pk__in=archive_group_ids).order_by(preserved)

                for archive in archives:
                    for archive_group in archive_groups:
                        if not ArchiveGroupEntry.objects.filter(
                                archive=archive, archive_group=archive_group).exists():
                            archive_group_entry = ArchiveGroupEntry(
                                archive=archive, archive_group=archive_group)
                            archive_group_entry.save()

                            message = 'Adding archive: {}, link: {}, to group: {}, link {}'.format(
                                archive.title, archive.get_absolute_url(),
                                archive_group.title, archive_group.get_absolute_url())
                            if 'reason' in p and p['reason'] != '':
                                message += ', reason: {}'.format(p['reason'])
                            frontend_logger.info("User {}: {}".format(request.user.username, message))
                            messages.success(request, message)
                            event_log(
                                request.user,
                                'ADD_ARCHIVE_TO_GROUP',
                                content_object=archive,
                                reason=user_reason,
                                result='added'
                            )

    params = {
        'sort': 'create_date',
        'asc_desc': 'desc',
        'filename': title,
    }

    for k, v in get.items():
        params[k] = v

    for k in archive_filter_keys:
        if k not in params:
            params[k] = ''

    results = filter_archives_simple(params)

    results = results.prefetch_related('gallery')

    paginator = Paginator(results, 100)
    try:
        results = paginator.page(page)
    except (InvalidPage, EmptyPage):
        results = paginator.page(paginator.num_pages)

    d = {'results': results, 'form': form}

    if request.user.has_perm('viewer.change_archivegroup'):
        group_form = ArchiveGroupSelectForm()
        d.update(group_form=group_form)

    return render(request, "viewer/collaborators/manage_archives.html", d)
def json_parser(request: HttpRequest) -> HttpResponse:
    response = {}

    if request.method == 'POST':
        if not request.body:
            response['error'] = 'Empty request'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        data = json.loads(request.body.decode("utf-8"))
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'operation' not in data or 'args' not in data:
                response['error'] = 'Wrong format'
            else:
                args = data['args']
                response = {}
                # Used by internal pages and userscript
                if data['operation'] == 'webcrawler' and 'link' in args:
                    if not crawler_settings.workers.web_queue:
                        response['error'] = 'The webqueue is not running'
                    elif 'downloader' in args:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if not current_settings.workers.web_queue:
                            response['error'] = 'The webqueue is not running'
                        else:
                            current_settings.allow_downloaders_only([args['downloader']], True, True, True)
                            archive = None
                            parsers = current_settings.provider_context.get_parsers(current_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            current_settings.workers.web_queue.enqueue_args_list((args['link'],), override_options=current_settings)
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                    else:
                        if 'parentLink' in args:
                            parent_archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['parentLink'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            parent_archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if parent_archive:
                                link = parent_archive.gallery.get_link()
                                if 'action' in args and args['action'] == 'replaceFound':
                                    parent_archive.gallery.mark_as_deleted()
                                    parent_archive.gallery = None
                                    parent_archive.delete_all_files()
                                    parent_archive.delete_files_but_archive()
                                    parent_archive.delete()
                                    response['message'] = "Crawling: " + args['link'] + ", deleting parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                elif 'action' in args and args['action'] == 'queueFound':
                                    response['message'] = "Crawling: " + args['link'] + ", keeping parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                else:
                                    response['message'] = "Please confirm deletion of parent: " + link
                                    response['action'] = 'confirmDeletion'
                            else:
                                archive = None
                                parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                                for parser in parsers:
                                    if parser.id_from_url_implemented():
                                        urls_filtered = parser.filter_accepted_urls((args['link'],))
                                        for url_filtered in urls_filtered:
                                            gallery_gid = parser.id_from_url(url_filtered)
                                            if gallery_gid:
                                                archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                        if urls_filtered:
                                            break
                                if archive:
                                    response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                                else:
                                    response['message'] = "Crawling: " + args['link']
                                crawler_settings.workers.web_queue.enqueue_args(args['link'])
                        else:
                            archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                            crawler_settings.workers.web_queue.enqueue_args(args['link'])
                    if not response:
                        response['error'] = 'Could not parse request'
                    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'archive_request':
                    archives_query = Archive.objects.filter_non_existent(
                        crawler_settings.MEDIA_ROOT, gallery__gid__in=args)
                    archives = [
                        {'gid': archive.gallery.gid,
                         'id': archive.id,
                         'zipped': archive.zipped.name,
                         'filesize': archive.filesize} for archive in archives_query
                    ]
                    response_text = json.dumps({'result': archives})
                    return HttpResponse(response_text, content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] in ('queue_archives', 'queue_galleries'):
                    urls = args
                    new_urls_set = set()
                    gids_set = set()
                    parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                gids_set.add(gid)
                    gids_list = list(gids_set)
                    existing_galleries = Gallery.objects.filter(gid__in=gids_list)
                    for gallery_object in existing_galleries:
                        if gallery_object.is_submitted():
                            gallery_object.delete()
                        # Delete queue galleries that failed, and does not have archives.
                        elif data['operation'] == 'queue_archives' and "failed" in gallery_object.dl_type and not gallery_object.archive_set.all():
                            gallery_object.delete()
                        elif data['operation'] == 'queue_archives' and not gallery_object.archive_set.all():
                            gallery_object.delete()
                    already_present_gids = list(Gallery.objects.filter(gid__in=gids_list).values_list('gid', flat=True))
                    # new_gids = list(gids_set - set(already_present_gids))
                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                if gid not in already_present_gids:
                                    new_urls_set.add(url)
                    pages_links = list(new_urls_set)
                    if len(pages_links) > 0:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if data['operation'] == 'queue_galleries':
                            current_settings.allow_type_downloaders_only('info')
                        elif data['operation'] == 'queue_archives':
                            if 'archive_reason' in data:
                                current_settings.archive_reason = data['archive_reason']
                            if 'archive_details' in data:
                                current_settings.archive_details = data['archive_details']
                            current_settings.allow_type_downloaders_only('fake')
                        if current_settings.workers.web_queue:
                            current_settings.workers.web_queue.enqueue_args_list(pages_links, override_options=current_settings)
                    else:
                        pages_links = []
                    return HttpResponse(json.dumps({'result': str(len(pages_links))}), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'links':
                    links = args
                    if len(links) > 0:
                        crawler_settings.workers.web_queue.enqueue_args_list(links)
                    return HttpResponse(json.dumps({'result': str(len(links))}), content_type="application/json; charset=utf-8")
                # Used by archive page
                elif data['operation'] == 'match_archive':
                    archive = Archive.objects.filter(pk=args['archive'])
                    if archive:
                        generate_possible_matches_for_archives(
                            archive,
                            filters=(args['match_filter'],),
                            logger=crawler_logger,
                            match_local=False,
                            match_web=True,
                        )
                    return HttpResponse(
                        json.dumps({'message': 'web matcher done, check the logs for results'}),
                        content_type="application/json; charset=utf-8")
                elif data['operation'] == 'match_archive_internally':
                    archive = Archive.objects.get(pk=args['archive'])
                    if archive:
                        clear_title = True if 'clear' in args else False
                        provider_filter = args.get('provider', '')
                        try:
                            cutoff = float(request.GET.get('cutoff', '0.4'))
                        except ValueError:
                            cutoff = 0.4
                        try:
                            max_matches = int(request.GET.get('max-matches', '10'))
                        except ValueError:
                            max_matches = 10
                        archive.generate_possible_matches(
                            clear_title=clear_title,
                            provider_filter=provider_filter,
                            cutoff=cutoff,
                            max_matches=max_matches
                        )
                        archive.save()
                    return HttpResponse(
                        json.dumps({'message': 'internal matcher done, check the archive for results'}),
                        content_type="application/json; charset=utf-8")
                else:
                    response['error'] = 'Unknown function'
    elif request.method == 'GET':
        data = request.GET
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'gc' in data:
                args = data.copy()

                for k in gallery_filter_keys:
                    if k not in args:
                        args[k] = ''

                keys = ("sort", "asc_desc")

                for k in keys:
                    if k not in args:
                        args[k] = ''

                # args = data
                # Already authorized by api key.
                args['public'] = False

                results = filter_galleries_no_request(args)

                if not results:
                    return HttpResponse(json.dumps([]), content_type="application/json; charset=utf-8")
                response_text = json.dumps(
                    [{
                        'gid': gallery.gid,
                        'token': gallery.token,
                        'title': gallery.title,
                        'title_jpn': gallery.title_jpn,
                        'category': gallery.category,
                        'uploader': gallery.uploader,
                        'comment': gallery.comment,
                        'posted': int(timestamp_or_zero(gallery.posted)),
                        'filecount': gallery.filecount,
                        'filesize': gallery.filesize,
                        'expunged': gallery.expunged,
                        'rating': gallery.rating,
                        'hidden': gallery.hidden,
                        'fjord': gallery.fjord,
                        'public': gallery.public,
                        'provider': gallery.provider,
                        'dl_type': gallery.dl_type,
                        'tags': gallery.tag_list(),
                        'link': gallery.get_link(),
                        'thumbnail': request.build_absolute_uri(
                            reverse('viewer:gallery-thumb', args=(gallery.pk,))) if gallery.thumbnail else '',
                        'thumbnail_url': gallery.thumbnail_url
                    } for gallery in results],
                    # indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                )
                return HttpResponse(response_text, content_type="application/json; charset=utf-8")
            else:
                response['error'] = 'Unknown function'
    else:
        response['error'] = 'Unsupported method: {}'.format(request.method)
    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
                    required=False,
                    action='store_true',
                    default=False,
                    help='Run the server as a daemon.')
parser.add_argument('-pf', '--pidfile',
                    required=False,
                    action='store',
                    default=None,
                    help='Store the process id in the given file.')

args = parser.parse_args()

if args.config_dir:
    crawler_settings = Settings(load_from_disk=True, default_dir=args.config_dir)
    os.environ['PANDA_CONFIG_DIR'] = args.config_dir
else:
    crawler_settings = Settings(load_from_disk=True)

if args.port:
    cherrypy_port = args.port
else:
    cherrypy_port = crawler_settings.webserver.bind_port

cherrypy_settings = {
    'server.socket_host': crawler_settings.webserver.bind_address,
    'server.socket_port': cherrypy_port,
    'checker.on': False,
    'engine.autoreload.on': crawler_settings.cherrypy_auto_restart,
    'log.screen': crawler_settings.webserver.log_to_screen,
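# Illustrative continuation (assumed, not from the project): a cherrypy_settings dict like
# the truncated one above is normally applied through CherryPy's standard config API
# before the engine is started.
import cherrypy

cherrypy.config.update(cherrypy_settings)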
def test_fakku_parser(self):
    """Test FAKKU gallery page parser"""
    settings = Settings(load_from_disk=True)

    gallery_link = 'https://www.fakku.net/hentai/im-a-piece-of-junk-sexaroid-english'

    parser = FakkuParser(settings)
    data = parser.fetch_gallery_data(gallery_link)
    expected_data = GalleryData(
        'hentai/im-a-piece-of-junk-sexaroid-english', 'fakku',
        link=gallery_link,
        title='I\'m a Piece of Junk Sexaroid',
        thumbnail_url='https://t.fakku.net/images/manga/i/im-a-piece-of-junk-sexaroid-english/thumbs/002.thumb.jpg',
        filecount=16,
        category='Manga',
        tags=[
            'artist:wakame-san', 'magazine:comic_kairakuten_beast_2017-05',
            'publisher:fakku', 'language:english', 'tsundere', 'femdom',
            'vanilla', 'b*****b', 'oppai', 'hentai', 'creampie',
            'uncensored', 'x-ray', 'subscription',
        ],
        comment='Plump slacker sex robot ❤',
    )

    self.assertEqual(data, expected_data)

    gallery_link = 'https://www.fakku.net/hentai/tsf-story-append-20-english_1497401155'

    parser = FakkuParser(settings)
    data = parser.fetch_gallery_data(gallery_link)
    expected_data = GalleryData(
        'hentai/tsf-story-append-20-english_1497401155', 'fakku',
        link=gallery_link,
        title='TSF Story Append 2.0',
        filecount=82,
        category='Doujinshi',
        tags=[
            'artist:oda_non', 'artist:yasui_riosuke', 'artist:meme50',
            'artist:kojima_saya', 'artist:butcha-u', 'artist:mizuryu_kei',
            'artist:kurenai_yuuji', 'artist:soine', 'artist:asanagi',
            'artist:yumeno_tanuki', 'artist:hiroyuki_sanadura',
            'artist:shindo_l', 'artist:naokame', 'artist:kin_no_hiyoko',
            'artist:masaru_yajiro', 'group:da_hootch', 'publisher:enshodo',
            'language:english', 'anal', 'b*****b', 'oppai', 'glasses',
            'stockings', 'group', 'nurse', 'hentai', 'ahegao', 'creampie',
            'uncensored', 'genderbend', 'doujin',
        ],
        comment="Takumi's life as a girl only continues to get more wild, as he (she?) continues "
                "to fall deeper into a life of promiscuity, drugs and unprotected sex with strangers. "
                "Will his friend Ryou be able to pull him out of this terrible spiral?",
        thumbnail_url='https://t.fakku.net/images/manga/t/tsf-story-append-20-english_1497401155_1502575464/thumbs/001.thumb.jpg',
    )

    self.assertEqual(data, expected_data)
def crawler(request: HttpRequest) -> HttpResponse:
    """Crawl given URLs."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to crawl a link.")
    d = {}

    p = request.POST

    if p:
        if 'keep_this_settings' in p:
            current_settings = crawler_settings
        else:
            current_settings = Settings(load_from_config=crawler_settings.config)
        url_set = set()
        # create dictionary of properties for each archive
        current_settings.replace_metadata = False
        current_settings.config['allowed']['replace_metadata'] = 'no'
        for k, v in p.items():
            if k.startswith("downloaders"):
                k, dl = k.split('-')
                current_settings.config['downloaders'][dl] = v
                current_settings.downloaders[dl] = int(v)
            elif k == "replace_metadata":
                current_settings.config['allowed'][k] = 'yes'
                current_settings.replace_metadata = True
            elif k == "urls":
                url_list = v.split("\n")
                for item in url_list:
                    url_set.add(item.rstrip('\r'))
        urls = list(url_set)
        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'keep_this_settings' in p:
            current_settings.write()
            current_settings.load_config_from_file()
        if 'run_separate' in p:
            crawler_thread = CrawlerThread(crawler_logger, current_settings, urls)
            crawler_thread.start()
        else:
            current_settings.workers.web_queue.enqueue_args_list(urls, override_options=current_settings)
        messages.success(request, 'Starting Crawler, check the logs for a report.')
        # Not really optimal when there's many commands being queued
        # for command in url_list:
        #     messages.success(request, command)
        return HttpResponseRedirect(reverse('viewer:main-page'))

    d.update({
        'settings': crawler_settings,
        'downloaders': crawler_settings.provider_context.get_downloaders_name_priority(crawler_settings)
    })

    return render(request, "viewer/crawler.html", d)
def foldercrawler(request: HttpRequest) -> HttpResponse:
    """Folder crawler."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to use the tools.")
    d = {'media_root': os.path.realpath(crawler_settings.MEDIA_ROOT)}

    p = request.POST

    if p:
        if 'keep_this_settings' in p:
            current_settings = crawler_settings
        else:
            current_settings = Settings(load_from_config=crawler_settings.config)
        commands = set()
        # create dictionary of properties for each command
        for k, v in p.items():
            if k.startswith("matchers"):
                k, matcher = k.split('-')
                current_settings.config['matchers'][matcher] = v
                current_settings.matchers[matcher] = int(v)
            elif k == "commands":
                command_list = v.split("\n")
                for item in command_list:
                    commands.add(item.rstrip('\r'))
            elif k == "internal_matches":
                current_settings.internal_matches_for_non_matches = True
        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (reason field max_length)
            current_settings.archive_source = source[:50]
        if 'keep_this_settings' in p:
            current_settings.write()
            current_settings.load_config_from_file()
        folder_crawler = FolderCrawlerThread(folder_logger, current_settings, list(commands))
        folder_crawler.start()
        messages.success(request, 'Starting Folder Crawler, check the logs for a report.')
        # Not really optimal when there's many commands being queued
        # for command in commands:
        #     messages.success(request, command)
        return HttpResponseRedirect(reverse('viewer:main-page'))

    d.update({
        'settings': crawler_settings,
        'matchers': crawler_settings.provider_context.get_matchers_name_priority(crawler_settings)
    })

    return render(request, "viewer/foldercrawler.html", d)
def process_downloaded_archive(self, archive: Archive) -> None:
    if os.path.isfile(archive.zipped.path):
        except_at_open = False
        return_error = None
        try:
            my_zip = ZipFile(archive.zipped.path, 'r')
            return_error = my_zip.testzip()
            my_zip.close()
        except (BadZipFile, NotImplementedError):
            except_at_open = True
        if except_at_open or return_error:
            if 'panda' in archive.source_type:
                self.logger.error(
                    "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                    "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                )
                crc32 = calc_crc32(archive.zipped.path)
                Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                if self.web_queue and archive.gallery:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                return
            else:
                self.logger.warning(
                    "For archive: {}, File check on downloaded zipfile: {}. "
                    "Check the file manually.".format(archive, archive.zipped.path)
                )
        crc32 = calc_crc32(archive.zipped.path)
        filesize = get_zip_filesize(archive.zipped.path)
        filecount = filecount_in_zip(archive.zipped.path)
        values = {
            'crc32': crc32,
            'filesize': filesize,
            'filecount': filecount,
        }
        updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
        if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
            if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "but there's already another archive that matches.".format(updated_archive)
                )
                return
            if 'panda' in archive.source_type:
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "downloading again from panda_archive.".format(updated_archive)
                )
                if self.web_queue:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list(
                        (updated_archive.gallery.get_link(),),
                        override_options=temp_settings
                    )
            else:
                self.logger.warning(
                    "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                )