Example #1
    def job(self) -> None:
        while not self.stop.is_set():

            # Sleep until the next scheduled run; stop.wait() returns True as soon as
            # a shutdown is requested, in which case the thread exits immediately.
            seconds_to_wait = self.wait_until_next_run()
            if self.stop.wait(timeout=seconds_to_wait):
                return

            if self.settings.autoupdater.enable:
                # Fresh settings for this run: keep the current download type, process
                # silently, and allow metadata to be replaced on re-crawl.
                current_settings = Settings(load_from_config=self.settings.config)
                current_settings.keep_dl_type = True
                current_settings.silent_processing = True
                current_settings.config['allowed']['replace_metadata'] = 'yes'

                # Drop the DB connection so Django opens a fresh one for the queries
                # below (the old one may have gone stale while the thread was waiting).
                connection.close()

                # Update window: galleries posted between (now - timer - buffer_back days)
                # and (now - buffer_after days).
                start_date = (
                    django_tz.now()
                    - timedelta(seconds=int(self.timer))
                    - timedelta(days=self.settings.autoupdater.buffer_back)
                )
                end_date = django_tz.now() - timedelta(days=self.settings.autoupdater.buffer_after)
                to_update_providers = current_settings.autoupdater.providers

                galleries = Gallery.objects.eligible_for_use(
                    posted__gte=start_date,
                    posted__lte=end_date,
                    provider__in=to_update_providers
                )

                if not galleries:
                    logger.info(
                        "No galleries posted from {} to {} need updating. Providers: {}".format(
                            start_date,
                            end_date,
                            ", ".join(to_update_providers)
                        )
                    )
                else:
                    # Keep only 'info' type downloaders, then keep only those whose
                    # provider is enabled for auto-update.
                    downloaders = current_settings.provider_context.get_downloaders_name_priority(current_settings, filter_name='info')
                    downloaders_names = [x[0] for x in downloaders if x[0].replace("_info", "") in to_update_providers]

                    current_settings.allow_downloaders_only(downloaders_names, True, True, True)

                    url_list = [x.get_link() for x in galleries]

                    logger.info(
                        "Starting timed auto updater, updating {} galleries "
                        "posted from {} to {}. Providers: {}".format(
                            len(url_list),
                            start_date,
                            end_date,
                            ", ".join(to_update_providers)
                        )
                    )

                    # Append the update-mode flag to the argument list passed to the crawler.
                    url_list.append('--update-mode')

                    self.web_queue.enqueue_args_list(url_list, override_options=current_settings)

            self.update_last_run(django_tz.now())
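
A minimal, standalone sketch of the stop-Event pattern used in Example #1: a single threading.Event (self.stop) serves both as the shutdown flag and, via stop.wait(timeout=...), as an interruptible sleep. The PeriodicWorker class and its print placeholder are illustrative only; the real job also reloads settings and enqueues gallery links as shown above.

import threading
import time


class PeriodicWorker:
    def __init__(self, interval_seconds: float) -> None:
        self.interval_seconds = interval_seconds
        self.stop = threading.Event()

    def job(self) -> None:
        while not self.stop.is_set():
            # wait() doubles as an interruptible sleep: it returns True as soon
            # as stop.set() is called, so shutdown never waits a full interval.
            if self.stop.wait(timeout=self.interval_seconds):
                return
            print("periodic work would run here")


if __name__ == '__main__':
    worker = PeriodicWorker(interval_seconds=1.0)
    thread = threading.Thread(target=worker.job, daemon=True)
    thread.start()
    time.sleep(2.5)
    worker.stop.set()  # request shutdown
    thread.join()      # returns promptly thanks to the Event-based wait
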
Example #2
    def process_downloaded_archive(self, archive: Archive) -> None:
        if os.path.isfile(archive.zipped.path):
            except_at_open = False
            return_error = None
            try:
                # testzip() returns the name of the first corrupt member, or None if the zip is fine.
                my_zip = ZipFile(archive.zipped.path, 'r')
                return_error = my_zip.testzip()
                my_zip.close()
            except (BadZipFile, NotImplementedError):
                except_at_open = True
            if except_at_open or return_error:
                if 'panda' in archive.source_type:
                    self.logger.error(
                        "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                        "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                    )
                    crc32 = calc_crc32(archive.zipped.path)
                    Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                    if self.web_queue and archive.gallery:
                        temp_settings = Settings(load_from_config=self.settings.config)
                        temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                        self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                        return
                else:
                    self.logger.warning(
                        "For archive: {}, file check failed on downloaded zipfile: {}. "
                        "Check the file manually.".format(archive, archive.zipped.path)
                    )
            # Refresh the stored file metadata from the zip on disk.
            crc32 = calc_crc32(archive.zipped.path)
            filesize = get_zip_filesize(archive.zipped.path)
            filecount = filecount_in_zip(archive.zipped.path)
            values = {
                'crc32': crc32,
                'filesize': filesize,
                'filecount': filecount,
            }
            updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
            if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
                if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                    self.logger.info(
                        "For archive: {} size does not match gallery, "
                        "but there's already another archive that matches.".format(updated_archive)
                    )
                    return
                if 'panda' in archive.source_type:
                    self.logger.info(
                        "For archive: {} size does not match gallery, "
                        "downloading again from panda_archive.".format(updated_archive)
                    )
                    if self.web_queue:
                        temp_settings = Settings(load_from_config=self.settings.config)
                        temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                        self.web_queue.enqueue_args_list(
                            (updated_archive.gallery.get_link(),),
                            override_options=temp_settings
                        )
                else:
                    self.logger.warning(
                        "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                    )
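
The integrity check at the top of Example #2 has two failure modes: ZipFile() raising BadZipFile or NotImplementedError on open, and testzip() returning the name of the first corrupt member. Below is a standalone sketch of that check using only the standard library; the verify_zip helper and the sample file name are hypothetical, not part of the project.

import os
from typing import Optional
from zipfile import BadZipFile, ZipFile


def verify_zip(path: str) -> Optional[str]:
    """Return None if the zip passes the check, otherwise a short error description."""
    if not os.path.isfile(path):
        return "not a file: {}".format(path)
    try:
        with ZipFile(path, 'r') as my_zip:
            # testzip() reads every member and returns the name of the first
            # corrupt file, or None when all CRCs check out.
            bad_member = my_zip.testzip()
    except (BadZipFile, NotImplementedError) as e:
        return "could not check zip: {}".format(e)
    return "corrupt member: {}".format(bad_member) if bad_member else None


if __name__ == '__main__':
    error = verify_zip('downloaded_archive.zip')
    print(error or 'zip file passed the integrity check')
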
Example #3
def json_parser(request: HttpRequest) -> HttpResponse:
    response = {}

    if request.method == 'POST':
        if not request.body:
            response['error'] = 'Empty request'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        data = json.loads(request.body.decode("utf-8"))
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'operation' not in data or 'args' not in data:
                response['error'] = 'Wrong format'
            else:
                args = data['args']
                response = {}
                # Used by internal pages and userscript
                if data['operation'] == 'webcrawler' and 'link' in args:
                    if not crawler_settings.workers.web_queue:
                        response['error'] = 'The webqueue is not running'
                    elif 'downloader' in args:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if not current_settings.workers.web_queue:
                            response['error'] = 'The webqueue is not running'
                        else:
                            current_settings.allow_downloaders_only([args['downloader']], True, True, True)
                            archive = None
                            parsers = current_settings.provider_context.get_parsers(current_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'], ))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            current_settings.workers.web_queue.enqueue_args_list((args['link'],), override_options=current_settings)
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                    else:
                        if 'parentLink' in args:
                            parent_archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['parentLink'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            parent_archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if parent_archive:
                                link = parent_archive.gallery.get_link()
                                if 'action' in args and args['action'] == 'replaceFound':
                                    parent_archive.gallery.mark_as_deleted()
                                    parent_archive.gallery = None
                                    parent_archive.delete_all_files()
                                    parent_archive.delete_files_but_archive()
                                    parent_archive.delete()
                                    response['message'] = "Crawling: " + args['link'] + ", deleting parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                elif 'action' in args and args['action'] == 'queueFound':
                                    response['message'] = "Crawling: " + args['link'] + ", keeping parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                else:
                                    response['message'] = "Please confirm deletion of parent: " + link
                                    response['action'] = 'confirmDeletion'
                            else:
                                archive = None
                                parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                                for parser in parsers:
                                    if parser.id_from_url_implemented():
                                        urls_filtered = parser.filter_accepted_urls((args['link'],))
                                        for url_filtered in urls_filtered:
                                            gallery_gid = parser.id_from_url(url_filtered)
                                            if gallery_gid:
                                                archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                        if urls_filtered:
                                            break
                                if archive:
                                    response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                                else:
                                    response['message'] = "Crawling: " + args['link']
                                crawler_settings.workers.web_queue.enqueue_args(args['link'])
                        else:
                            archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                            crawler_settings.workers.web_queue.enqueue_args(args['link'])
                    if not response:
                        response['error'] = 'Could not parse request'
                    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'archive_request':
                    archives_query = Archive.objects.filter_non_existent(crawler_settings.MEDIA_ROOT, gallery__gid__in=args)
                    archives = [{'gid': archive.gallery.gid,
                                 'id': archive.id,
                                 'zipped': archive.zipped.name,
                                 'filesize': archive.filesize} for archive in archives_query]
                    response_text = json.dumps({'result': archives})
                    return HttpResponse(response_text, content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] in ('queue_archives', 'queue_galleries'):
                    urls = args
                    new_urls_set = set()
                    gids_set = set()

                    parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                gids_set.add(gid)

                    gids_list = list(gids_set)

                    existing_galleries = Gallery.objects.filter(gid__in=gids_list)
                    for gallery_object in existing_galleries:
                        if gallery_object.is_submitted():
                            gallery_object.delete()
                        # For queue_archives, delete queued galleries that failed or ended up without any archive.
                        elif data['operation'] == 'queue_archives' and "failed" in gallery_object.dl_type and not gallery_object.archive_set.all():
                            gallery_object.delete()
                        elif data['operation'] == 'queue_archives' and not gallery_object.archive_set.all():
                            gallery_object.delete()
                    already_present_gids = list(Gallery.objects.filter(gid__in=gids_list).values_list('gid', flat=True))
                    # new_gids = list(gids_set - set(already_present_gids))

                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                if gid not in already_present_gids:
                                    new_urls_set.add(url)

                    pages_links = list(new_urls_set)
                    if len(pages_links) > 0:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if data['operation'] == 'queue_galleries':
                            current_settings.allow_type_downloaders_only('info')
                        elif data['operation'] == 'queue_archives':
                            if 'archive_reason' in data:
                                current_settings.archive_reason = data['archive_reason']
                            if 'archive_details' in data:
                                current_settings.archive_details = data['archive_details']
                            current_settings.allow_type_downloaders_only('fake')
                        if current_settings.workers.web_queue:
                            current_settings.workers.web_queue.enqueue_args_list(pages_links, override_options=current_settings)
                        else:
                            pages_links = []
                    return HttpResponse(json.dumps({'result': str(len(pages_links))}), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'links':
                    links = args
                    if len(links) > 0:
                        crawler_settings.workers.web_queue.enqueue_args_list(links)
                    return HttpResponse(json.dumps({'result': str(len(links))}), content_type="application/json; charset=utf-8")
                # Used by archive page
                elif data['operation'] == 'match_archive':
                    archives = Archive.objects.filter(pk=args['archive'])
                    if archives:
                        generate_possible_matches_for_archives(
                            archives,
                            filters=(args['match_filter'],),
                            logger=crawler_logger,
                            match_local=False,
                            match_web=True,
                        )
                    return HttpResponse(json.dumps({'message': 'web matcher done, check the logs for results'}),
                                        content_type="application/json; charset=utf-8")
                elif data['operation'] == 'match_archive_internally':
                    archive = Archive.objects.get(pk=args['archive'])
                    if archive:
                        clear_title = 'clear' in args
                        provider_filter = args.get('provider', '')
                        try:
                            cutoff = float(request.GET.get('cutoff', '0.4'))
                        except ValueError:
                            cutoff = 0.4
                        try:
                            max_matches = int(request.GET.get('max-matches', '10'))
                        except ValueError:
                            max_matches = 10

                        archive.generate_possible_matches(
                            clear_title=clear_title, provider_filter=provider_filter,
                            cutoff=cutoff, max_matches=max_matches
                        )
                        archive.save()
                    return HttpResponse(json.dumps({'message': 'internal matcher done, check the archive for results'}),
                                        content_type="application/json; charset=utf-8")
                else:
                    response['error'] = 'Unknown function'
    elif request.method == 'GET':
        data = request.GET
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'gc' in data:
                args = data.copy()

                for k in gallery_filter_keys:
                    if k not in args:
                        args[k] = ''

                keys = ("sort", "asc_desc")

                for k in keys:
                    if k not in args:
                        args[k] = ''

                # args = data
                # Already authorized by api key.
                args['public'] = False

                results = filter_galleries_no_request(args)
                if not results:
                    return HttpResponse(json.dumps([]), content_type="application/json; charset=utf-8")
                response_text = json.dumps(
                    [{
                        'gid': gallery.gid,
                        'token': gallery.token,
                        'title': gallery.title,
                        'title_jpn': gallery.title_jpn,
                        'category': gallery.category,
                        'uploader': gallery.uploader,
                        'comment': gallery.comment,
                        'posted': int(timestamp_or_zero(gallery.posted)),
                        'filecount': gallery.filecount,
                        'filesize': gallery.filesize,
                        'expunged': gallery.expunged,
                        'rating': gallery.rating,
                        'hidden': gallery.hidden,
                        'fjord': gallery.fjord,
                        'public': gallery.public,
                        'provider': gallery.provider,
                        'dl_type': gallery.dl_type,
                        'tags': gallery.tag_list(),
                        'link': gallery.get_link(),
                        'thumbnail': request.build_absolute_uri(reverse('viewer:gallery-thumb', args=(gallery.pk,))) if gallery.thumbnail else '',
                        'thumbnail_url': gallery.thumbnail_url
                    } for gallery in results
                    ],
                    # indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                )
                return HttpResponse(response_text, content_type="application/json; charset=utf-8")
            else:
                response['error'] = 'Unknown function'
    else:
        response['error'] = 'Unsupported method: {}'.format(request.method)
    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
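
For reference, the view in Example #3 expects a raw JSON body containing api_key, operation, and args. The following is a minimal client sketch using the requests library; the endpoint URL, API key, and gallery link are placeholders, and 'webcrawler' is one of the operations handled above.

import json

import requests

API_URL = 'http://localhost:8000/jsonapi/'  # placeholder: the real route is project-specific
API_KEY = 'changeme'                        # placeholder API key

payload = {
    'api_key': API_KEY,
    'operation': 'webcrawler',
    'args': {'link': 'https://example.com/gallery/12345'},
}

# The view reads request.body directly, so a raw JSON body is enough.
response = requests.post(API_URL, data=json.dumps(payload))
result = response.json()
# The view replies with either a 'message' or an 'error' key.
print(result.get('message') or result.get('error'))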