def foldercrawler(request: HttpRequest) -> HttpResponse:
    """Run the folder crawler with the commands posted by a staff user."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to use the tools.")

    d: dict[str, typing.Any] = {'media_root': os.path.realpath(crawler_settings.MEDIA_ROOT)}

    p = request.POST

    if p:
        if 'keep_this_settings' in p:
            current_settings = crawler_settings
        else:
            current_settings = Settings(load_from_config=crawler_settings.config)
        commands = set()
        # create dictionary of properties for each command
        for k, v in p.items():
            if k.startswith("matchers"):
                k, matcher = k.split('-')
                current_settings.config['matchers'][matcher] = v
                current_settings.matchers[matcher] = int(v)
            elif k == "commands":
                command_list = v.split("\n")
                for item in command_list:
                    commands.add(item.rstrip('\r'))
            elif k == "internal_matches":
                current_settings.internal_matches_for_non_matches = True
        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (source field max_length)
            current_settings.archive_source = source[:50]
        if 'keep_this_settings' in p:
            current_settings.write()
            current_settings.load_config_from_file()
        folder_crawler = FolderCrawlerThread(current_settings, list(commands))
        folder_crawler.start()

        messages.success(request, 'Starting Folder Crawler, check the logs for a report.')
        # Not really optimal when there's many commands being queued
        # for command in commands:
        #     messages.success(request, command)
        return HttpResponseRedirect(reverse('viewer:main-page'))

    d.update({
        'settings': crawler_settings,
        'matchers': crawler_settings.provider_context.get_matchers_name_priority(crawler_settings)
    })

    return render(request, "viewer/foldercrawler.html", d)


def crawler(request: HttpRequest) -> HttpResponse:
    """Crawl given URLs."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to crawl a link.")

    d = {}
    p = request.POST

    if p:
        if 'keep_this_settings' in p:
            current_settings = crawler_settings
        else:
            current_settings = Settings(load_from_config=crawler_settings.config)
        url_set = set()
        # create dictionary of properties for each archive
        current_settings.replace_metadata = False
        current_settings.config['allowed']['replace_metadata'] = 'no'
        for k, v in p.items():
            if k.startswith("downloaders"):
                k, dl = k.split('-')
                current_settings.config['downloaders'][dl] = v
                current_settings.downloaders[dl] = int(v)
            elif k == "replace_metadata":
                current_settings.config['allowed'][k] = 'yes'
                current_settings.replace_metadata = True
            elif k == "urls":
                url_list = v.split("\n")
                for item in url_list:
                    url_set.add(item.rstrip('\r'))
        urls = list(url_set)
        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'keep_this_settings' in p:
            current_settings.write()
            current_settings.load_config_from_file()
        if 'run_separate' in p:
            crawler_thread = CrawlerThread(crawler_logger, current_settings, urls)
            crawler_thread.start()
        else:
            current_settings.workers.web_queue.enqueue_args_list(urls, override_options=current_settings)

        messages.success(request, 'Starting Crawler, check the logs for a report.')
        # Not really optimal when there's many commands being queued
        # for command in url_list:
        #     messages.success(request, command)
        return HttpResponseRedirect(reverse('viewer:main-page'))

    d.update({
        'settings': crawler_settings,
        'downloaders': crawler_settings.provider_context.get_downloaders_name_priority(crawler_settings)
    })

    return render(request, "viewer/crawler.html", d)


def user_crawler(request: HttpRequest) -> HttpResponse:
    """Queue URLs submitted by a regular user, restricted to generic downloaders."""
    d = {}
    p = request.POST

    all_downloaders = crawler_settings.provider_context.get_downloaders_name_priority(
        crawler_settings, filter_name='generic_')

    # providers_not_generic = list(set([x[0].provider for x in all_downloaders if not x[0].provider.is_generic()]))
    generic_downloaders = [x[0] for x in all_downloaders]

    user_reason = p.get('reason', '')

    if p:
        current_settings = Settings(load_from_config=crawler_settings.config)
        if not current_settings.workers.web_queue:
            messages.error(request, 'Cannot submit links currently. Please contact an admin.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])
        url_set = set()
        # create dictionary of properties for each archive
        current_settings.replace_metadata = False
        current_settings.config['allowed']['replace_metadata'] = 'no'
        for k, v in p.items():
            if k == "downloader":
                if v == 'no-generic':
                    continue
                elif v in generic_downloaders:
                    current_settings.enable_downloader_only(v)
            elif k == "urls":
                url_list = v.split("\n")
                for item in url_list:
                    url_set.add(item.rstrip('\r'))
        urls = list(url_set)

        if not urls:
            messages.error(request, 'Submission is empty.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])

        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (source field max_length)
            current_settings.archive_source = source[:50]

        current_settings.archive_user = request.user

        parsers = crawler_settings.provider_context.get_parsers_classes()

        def archive_callback(x: Optional['Archive'], crawled_url: Optional[str], result: str) -> None:
            event_log(
                request.user,
                'ADD_ARCHIVE',
                reason=user_reason,
                content_object=x,
                result=result,
                data=crawled_url
            )

        def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
            event_log(
                request.user,
                'ADD_GALLERY',
                reason=user_reason,
                content_object=x,
                result=result,
                data=crawled_url
            )

        current_settings.workers.web_queue.enqueue_args_list(
            urls,
            override_options=current_settings,
            archive_callback=archive_callback,
            gallery_callback=gallery_callback,
            use_argparser=False
        )

        messages.success(
            request,
            'Starting Crawler, if the links were correctly added, they should appear on the archive or gallery list.'
        )
        for url in urls:
            frontend_logger.info("User {}: queued link: {}".format(request.user.username, url))
            # event_log(
            #     request.user,
            #     'CRAWL_URL',
            #     reason=user_reason,
            #     data=url,
            #     result='queue'
            # )

        # Report back which of the submitted URLs are already known, per provider parser.
        found_valid_urls: List[str] = []

        for parser in parsers:
            if parser.id_from_url_implemented():
                urls_filtered = parser.filter_accepted_urls(urls)
                found_valid_urls.extend(urls_filtered)
                for url_filtered in urls_filtered:
                    gid = parser.id_from_url(url_filtered)
                    gallery = Gallery.objects.filter(gid=gid).first()
                    if not gallery:
                        messages.success(
                            request,
                            '{}: New URL, will be added to the submit queue'.format(url_filtered)
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='queued'
                        )
                        continue
                    if gallery.is_submitted():
                        messages.info(
                            request,
                            '{}: Already in submit queue, link: {}, reason: {}'.format(
                                url_filtered, gallery.get_absolute_url(), gallery.reason)
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_submitted'
                        )
                    elif gallery.public:
                        messages.info(
                            request,
                            '{}: Already present, is public: {}'.format(
                                url_filtered, request.build_absolute_uri(gallery.get_absolute_url()))
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_public'
                        )
                    else:
                        messages.info(
                            request,
                            '{}: Already present, is not public: {}'.format(
                                url_filtered, request.build_absolute_uri(gallery.get_absolute_url()))
                        )
                        event_log(
                            request.user,
                            'CRAWL_URL',
                            reason=user_reason,
                            data=url_filtered,
                            result='already_private'
                        )

        extra_urls = [x for x in urls if x not in found_valid_urls]

        for extra_url in extra_urls:
            messages.info(request, '{}: Extra non-provider URL'.format(extra_url))
            event_log(
                request.user,
                'CRAWL_URL',
                reason=user_reason,
                data=extra_url,
                result='queued'
            )

        # Not really optimal when there's many commands being queued
        # for command in url_list:
        #     messages.success(request, command)
        return HttpResponseRedirect(request.META["HTTP_REFERER"])

    d.update({'downloaders': generic_downloaders})

    return render(request, "viewer/collaborators/gallery_crawler.html", d)


def missing_archives_for_galleries(request: HttpRequest) -> HttpResponse:
    """List galleries with no associated archive; staff can delete them, queue them for download or recall the provider API."""
    p = request.POST
    get = request.GET

    title = get.get("title", '')
    tags = get.get("tags", '')

    try:
        page = int(get.get("page", '1'))
    except ValueError:
        page = 1

    if 'clear' in get:
        form = GallerySearchForm()
    else:
        form = GallerySearchForm(initial={'title': title, 'tags': tags})

    if p and request.user.is_staff:
        pks = []
        for k, v in p.items():
            if k.startswith("sel-"):
                # k, pk = k.split('-')
                # results[pk][k] = v
                pks.append(v)
        results = Gallery.objects.filter(id__in=pks).order_by('-create_date')

        if 'delete_galleries' in p:
            for gallery in results:
                message = 'Removing gallery: {}, link: {}'.format(gallery.title, gallery.get_link())
                frontend_logger.info(message)
                messages.success(request, message)
                gallery.mark_as_deleted()
        elif 'download_galleries' in p:
            for gallery in results:
                message = 'Queueing gallery: {}, link: {}'.format(gallery.title, gallery.get_link())
                frontend_logger.info(message)
                messages.success(request, message)
                # Force replace_metadata when queueing from this list, since it's mostly used to download non-used galleries.
                current_settings = Settings(load_from_config=crawler_settings.config)
                if current_settings.workers.web_queue:
                    current_settings.replace_metadata = True
                    current_settings.retry_failed = True
                    if 'reason' in p and p['reason'] != '':
                        reason = p['reason']
                        # Force limit string length (reason field max_length)
                        current_settings.archive_reason = reason[:200]
                        current_settings.archive_details = gallery.reason
                        current_settings.gallery_reason = reason[:200]
                    elif gallery.reason:
                        current_settings.archive_reason = gallery.reason
                    current_settings.workers.web_queue.enqueue_args_list(
                        (gallery.get_link(), ), override_options=current_settings)
        elif 'recall_api' in p:
            message = 'Recalling API for {} galleries'.format(results.count())
            frontend_logger.info(message)
            messages.success(request, message)
            gallery_links = [x.get_link() for x in results]
            gallery_providers = list(results.values_list('provider', flat=True).distinct())
            current_settings = Settings(load_from_config=crawler_settings.config)
            if current_settings.workers.web_queue:
                current_settings.set_update_metadata_options(providers=gallery_providers)
                current_settings.workers.web_queue.enqueue_args_list(
                    gallery_links, override_options=current_settings)

    if 'force_public' in request.GET:
        force_public = True
    else:
        force_public = False

    if request.user.is_staff and not force_public:
        providers = Gallery.objects.all().values_list('provider', flat=True).distinct()

        params = {}
        for k, v in get.items():
            params[k] = v
        for k in gallery_filter_keys:
            if k not in params:
                params[k] = ''

        results = filter_galleries_simple(params)
        results = results.non_used_galleries().prefetch_related('foundgallery_set')

        paginator = Paginator(results, 50)
        try:
            results = paginator.page(page)
        except (InvalidPage, EmptyPage):
            results = paginator.page(paginator.num_pages)

        d = {'results': results, 'providers': providers, 'force_public': force_public, 'form': form}
    else:
        params = {}
        for k, v in get.items():
            params[k] = v
        for k in gallery_filter_keys:
            if k not in params:
                params[k] = ''

        results = filter_galleries_simple(params)
        results = results.non_used_galleries(public=True, provider__in=['panda', 'fakku'])

        d = {'results': results}

    return render(request, "viewer/archives_missing_for_galleries.html", d)


def submit_queue(request: HttpRequest) -> HttpResponse:
    """Review the submit queue: deny submitted galleries or accept them and queue them for download."""
    p = request.POST
    get = request.GET

    title = get.get("title", '')
    tags = get.get("tags", '')

    user_reason = p.get('reason', '')

    try:
        page = int(get.get("page", '1'))
    except ValueError:
        page = 1

    if 'clear' in get:
        form = GallerySearchForm()
    else:
        form = GallerySearchForm(initial={'title': title, 'tags': tags})

    if p:
        pks = []
        for k, v in p.items():
            if k.startswith("sel-"):
                # k, pk = k.split('-')
                # results[pk][k] = v
                pks.append(v)

        # Preserve the order in which the galleries were selected.
        preserved = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(pks)])

        if 'denied' in get:
            results = Gallery.objects.submitted_galleries(id__in=pks).order_by(preserved)
        else:
            results = Gallery.objects.submitted_galleries(
                ~Q(status=Gallery.DENIED), id__in=pks).order_by(preserved)

        if 'deny_galleries' in p:
            for gallery in results:
                message = 'Denying gallery: {}, link: {}, source link: {}'.format(
                    gallery.title, gallery.get_absolute_url(), gallery.get_link())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                gallery.mark_as_denied()
                event_log(
                    request.user,
                    'DENY_GALLERY',
                    reason=user_reason,
                    content_object=gallery,
                    result='denied'
                )
        elif 'download_galleries' in p:
            for gallery in results:
                message = 'Queueing gallery: {}, link: {}, source link: {}'.format(
                    gallery.title, gallery.get_absolute_url(), gallery.get_link())
                if 'reason' in p and p['reason'] != '':
                    message += ', reason: {}'.format(p['reason'])
                frontend_logger.info("User {}: {}".format(request.user.username, message))
                messages.success(request, message)
                event_log(
                    request.user,
                    'ACCEPT_GALLERY',
                    reason=user_reason,
                    content_object=gallery,
                    result='accepted'
                )
                # Force replace_metadata when queueing from this list, since it's mostly used to download non-used galleries.
                current_settings = Settings(load_from_config=crawler_settings.config)
                if current_settings.workers.web_queue:
                    current_settings.replace_metadata = True
                    current_settings.retry_failed = True

                    if 'reason' in p and p['reason'] != '':
                        reason = p['reason']
                        # Force limit string length (reason field max_length)
                        current_settings.archive_reason = reason[:200]
                        current_settings.archive_details = gallery.reason
                        current_settings.gallery_reason = reason[:200]
                    elif gallery.reason:
                        current_settings.archive_reason = gallery.reason

                    def archive_callback(x: Optional['Archive'], crawled_url: Optional[str], result: str) -> None:
                        event_log(
                            request.user,
                            'ADD_ARCHIVE',
                            reason=user_reason,
                            content_object=x,
                            result=result,
                            data=crawled_url
                        )

                    def gallery_callback(x: Optional['Gallery'], crawled_url: Optional[str], result: str) -> None:
                        event_log(
                            request.user,
                            'ADD_GALLERY',
                            reason=user_reason,
                            content_object=x,
                            result=result,
                            data=crawled_url
                        )

                    current_settings.workers.web_queue.enqueue_args_list(
                        (gallery.get_link(), ),
                        override_options=current_settings,
                        archive_callback=archive_callback,
                        gallery_callback=gallery_callback,
                    )

    providers = Gallery.objects.all().values_list('provider', flat=True).distinct()

    params = {}

    for k, v in get.items():
        params[k] = v

    for k in gallery_filter_keys:
        if k not in params:
            params[k] = ''

    results = filter_galleries_simple(params)

    if 'denied' in get:
        results = results.submitted_galleries().prefetch_related('foundgallery_set')
    else:
        results = results.submitted_galleries(~Q(status=Gallery.DENIED)).prefetch_related('foundgallery_set')

    paginator = Paginator(results, 50)
    try:
        results = paginator.page(page)
    except (InvalidPage, EmptyPage):
        results = paginator.page(paginator.num_pages)

    d = {'results': results, 'providers': providers, 'form': form}

    return render(request, "viewer/collaborators/submit_queue.html", d)