Example 1
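Both examples rely on module-level imports not shown in the listing. The standard-library and Django imports below are certain from the calls in the code; the project-specific names (crawler_settings, Settings, FolderCrawlerThread, render_error, event_log, frontend_logger, Gallery, Archive) are assumed to come from the surrounding project, whose module paths are not visible here.

import os
import typing
from typing import List, Optional

from django.contrib import messages
from django.http import HttpRequest, HttpResponse, HttpResponseRedirect
from django.shortcuts import render
from django.urls import reverse

# Project-specific names (crawler_settings, Settings, FolderCrawlerThread,
# render_error, event_log, frontend_logger, Gallery, Archive) are assumed
# to be imported from the surrounding project's own modules.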
def foldercrawler(request: HttpRequest) -> HttpResponse:
    """Folder crawler."""
    if not request.user.is_staff:
        return render_error(request, "You need to be an admin to use the tools.")

    d: dict[str, typing.Any] = {'media_root': os.path.realpath(crawler_settings.MEDIA_ROOT)}

    p = request.POST

    if p:
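        # 'keep_this_settings' reuses (and below persists) the live settings;
        # otherwise the crawl runs on a throwaway copy loaded from the config.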
        if 'keep_this_settings' in p:
            current_settings = crawler_settings
        else:
            current_settings = Settings(load_from_config=crawler_settings.config)
        commands = set()
        # apply per-matcher priorities and collect the commands to run
        for k, v in p.items():
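            # Keys of the form 'matchers-<name>' carry that matcher's priority.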
            if k.startswith("matchers"):
                k, matcher = k.split('-')
                current_settings.config['matchers'][matcher] = v
                current_settings.matchers[matcher] = int(v)
            elif k == "commands":
                command_list = v.split("\n")
                for item in command_list:
                    commands.add(item.rstrip('\r'))
            elif k == "internal_matches":
                current_settings.internal_matches_for_non_matches = True

        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]

        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (source field max_length)
            current_settings.archive_source = source[:50]

        if 'keep_this_settings' in p:
            current_settings.write()
            current_settings.load_config_from_file()
        folder_crawler = FolderCrawlerThread(current_settings, list(commands))
        folder_crawler.start()
        messages.success(request, 'Starting Folder Crawler, check the logs for a report.')
        # Not really optimal when there are many commands being queued
        # for command in commands:
        #     messages.success(request, command)
        return HttpResponseRedirect(reverse('viewer:main-page'))

    d.update({
        'settings': crawler_settings,
        'matchers': crawler_settings.provider_context.get_matchers_name_priority(crawler_settings)
    })

    return render(request, "viewer/foldercrawler.html", d)
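A minimal sketch of exercising this view from a Django test, assuming a staff-user fixture (staff_user here is hypothetical) and that the view is routed under the made-up URL name 'viewer:foldercrawler'; the POST keys are the ones the view actually reads.

from django.test import Client
from django.urls import reverse

client = Client()
client.force_login(staff_user)  # staff_user: assumed existing is_staff account
response = client.post(reverse('viewer:foldercrawler'), {  # URL name is a guess
    'commands': 'folder_one\nfolder_two',  # one crawl command per line
    'reason': 'bulk import',               # view truncates to 200 chars
    'source': 'local',                     # view truncates to 50 chars
})
assert response.status_code == 302         # redirect to viewer:main-page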
Example 2
def user_crawler(request: HttpRequest) -> HttpResponse:
    """Crawl given URLs."""

    d = {}

    p = request.POST

    all_downloaders = crawler_settings.provider_context.get_downloaders_name_priority(
        crawler_settings, filter_name='generic_')

    # providers_not_generic = list(set([x[0].provider for x in all_downloaders if not x[0].provider.is_generic()]))
    generic_downloaders = [x[0] for x in all_downloaders]

    user_reason = p.get('reason', '')

    if p:
        current_settings = Settings(load_from_config=crawler_settings.config)
        if not current_settings.workers.web_queue:
            messages.error(
                request,
                'Cannot submit links currently. Please contact an admin.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])
        url_set = set()
        # apply the submitted form options to this one-shot settings copy
        current_settings.replace_metadata = False
        current_settings.config['allowed']['replace_metadata'] = 'no'
        for k, v in p.items():
            if k == "downloader":
                if v == 'no-generic':
                    continue
                elif v in generic_downloaders:
                    current_settings.enable_downloader_only(v)
            elif k == "urls":
                url_list = v.split("\n")
                for item in url_list:
                    url_set.add(item.rstrip('\r'))
        urls = list(url_set)

        if not urls:
            messages.error(request, 'Submission is empty.')
            return HttpResponseRedirect(request.META["HTTP_REFERER"])

        if 'reason' in p and p['reason'] != '':
            reason = p['reason']
            # Force limit string length (reason field max_length)
            current_settings.archive_reason = reason[:200]
            current_settings.gallery_reason = reason[:200]
        if 'source' in p and p['source'] != '':
            source = p['source']
            # Force limit string length (source field max_length)
            current_settings.archive_source = source[:50]

        current_settings.archive_user = request.user

        parsers = crawler_settings.provider_context.get_parsers_classes()

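        # Callbacks (presumably invoked by the queue per crawled item) record
        # each archive/gallery result in the event log with the user's reason.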
        def archive_callback(x: Optional['Archive'],
                             crawled_url: Optional[str], result: str) -> None:
            event_log(request.user,
                      'ADD_ARCHIVE',
                      reason=user_reason,
                      content_object=x,
                      result=result,
                      data=crawled_url)

        def gallery_callback(x: Optional['Gallery'],
                             crawled_url: Optional[str], result: str) -> None:
            event_log(request.user,
                      'ADD_GALLERY',
                      reason=user_reason,
                      content_object=x,
                      result=result,
                      data=crawled_url)

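        # Queue the deduplicated URLs for background crawling with this
        # request's one-shot settings and the logging callbacks above.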
        current_settings.workers.web_queue.enqueue_args_list(
            urls,
            override_options=current_settings,
            archive_callback=archive_callback,
            gallery_callback=gallery_callback,
            use_argparser=False)

        messages.success(
            request,
            'Starting Crawler. If the links were correctly added, they should appear in the archive or gallery lists.'
        )
        for url in urls:
            frontend_logger.info("User {}: queued link: {}".format(
                request.user.username, url))
            # event_log(
            #     request.user,
            #     'CRAWL_URL',
            #     reason=user_reason,
            #     data=url,
            #     result='queue'
            # )

        found_valid_urls: List[str] = []

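        # Check each submitted URL against every parser that can derive a
        # gallery id from a URL, and report its current status to the user.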
        for parser in parsers:
            if parser.id_from_url_implemented():
                urls_filtered = parser.filter_accepted_urls(urls)
                found_valid_urls.extend(urls_filtered)
                for url_filtered in urls_filtered:
                    gid = parser.id_from_url(url_filtered)
                    gallery = Gallery.objects.filter(gid=gid).first()
                    if not gallery:
                        messages.success(
                            request,
                            '{}: New URL, will be added to the submit queue'.
                            format(url_filtered))
                        event_log(request.user,
                                  'CRAWL_URL',
                                  reason=user_reason,
                                  data=url_filtered,
                                  result='queued')
                        continue
                    if gallery.is_submitted():
                        messages.info(
                            request,
                            '{}: Already in submit queue, link: {}, reason: {}'
                            .format(url_filtered, gallery.get_absolute_url(),
                                    gallery.reason))
                        event_log(request.user,
                                  'CRAWL_URL',
                                  reason=user_reason,
                                  data=url_filtered,
                                  result='already_submitted')
                    elif gallery.public:
                        messages.info(
                            request,
                            '{}: Already present, is public: {}'.format(
                                url_filtered,
                                request.build_absolute_uri(
                                    gallery.get_absolute_url())))
                        event_log(request.user,
                                  'CRAWL_URL',
                                  reason=user_reason,
                                  data=url_filtered,
                                  result='already_public')
                    else:
                        messages.info(
                            request,
                            '{}: Already present, is not public: {}'.format(
                                url_filtered,
                                request.build_absolute_uri(
                                    gallery.get_absolute_url())))
                        event_log(request.user,
                                  'CRAWL_URL',
                                  reason=user_reason,
                                  data=url_filtered,
                                  result='already_private')

        extra_urls = [x for x in urls if x not in found_valid_urls]

        for extra_url in extra_urls:
            messages.info(request,
                          '{}: Extra non-provider URL'.format(extra_url))
            event_log(request.user,
                      'CRAWL_URL',
                      reason=user_reason,
                      data=extra_url,
                      result='queued')
        # Not really optimal when there are many URLs being queued
        # for url in urls:
        #     messages.success(request, url)
        return HttpResponseRedirect(request.META["HTTP_REFERER"])

    d.update({'downloaders': generic_downloaders})

    return render(request, "viewer/collaborators/gallery_crawler.html", d)
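A similar sketch for this view, assuming a logged-in user fixture (some_user is hypothetical) and the made-up URL name 'viewer:gallery-crawler'; HTTP_REFERER must be set because the view redirects back to it.

from django.test import Client
from django.urls import reverse

client = Client()
client.force_login(some_user)  # some_user: assumed existing account
response = client.post(
    reverse('viewer:gallery-crawler'),  # URL name is a guess
    {
        'urls': 'https://example.com/g/1\nhttps://example.com/g/2',  # one URL per line
        'downloader': 'no-generic',   # do not restrict to a generic downloader
        'reason': 'user submission',  # view truncates to 200 chars
    },
    HTTP_REFERER='/submit/',  # the view redirects back to the referrer
)
assert response.status_code == 302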