Example #1
    def format_match_values(self) -> Optional[DataDict]:

        if not self.match_values:
            return None
        self.match_gid = self.match_values.gid
        values = {
            'title': self.match_title,
            'title_jpn': '',
            'zipped': self.file_path,
            'crc32': self.crc32,
            'match_type': self.found_by,
            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, self.file_path)),
            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, self.file_path)),
            'source_type': self.provider
        }

        return values
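
Example #1 calls the project helpers get_zip_filesize and filecount_in_zip, whose implementations are not shown here. A minimal sketch of what such helpers could look like, using only the standard library (the zip_path parameter and the error handling are assumptions for illustration; the project's real helpers may differ):

import os
from zipfile import ZipFile, BadZipFile


def get_zip_filesize(zip_path: str) -> int:
    # Size on disk of the zip file itself, in bytes.
    return os.path.getsize(zip_path)


def filecount_in_zip(zip_path: str) -> int:
    # Number of file entries inside the zip, ignoring directory entries.
    try:
        with ZipFile(zip_path, 'r') as my_zip:
            return len([info for info in my_zip.infolist() if not info.is_dir()])
    except (BadZipFile, NotImplementedError):
        return 0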
Example #2
    def process_downloaded_archive(self, archive: Archive) -> None:
        if os.path.isfile(archive.zipped.path):
            except_at_open = False
            return_error = None
            # Run the zipfile's own integrity test; an exception or a bad member counts as a failure.
            try:
                my_zip = ZipFile(archive.zipped.path, 'r')
                return_error = my_zip.testzip()
                my_zip.close()
            except (BadZipFile, NotImplementedError):
                except_at_open = True
            if except_at_open or return_error:
                if 'panda' in archive.source_type:
                    self.logger.error(
                        "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                        "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                    )
                    crc32 = calc_crc32(archive.zipped.path)
                    Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                    if self.web_queue and archive.gallery:
                        temp_settings = Settings(load_from_config=self.settings.config)
                        temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                        self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                        return
                else:
                    self.logger.warning(
                        "For archive: {}, file check on downloaded zipfile failed on file: {}. "
                        "Check the file manually.".format(archive, archive.zipped.path)
                    )
            crc32 = calc_crc32(archive.zipped.path)
            filesize = get_zip_filesize(archive.zipped.path)
            filecount = filecount_in_zip(archive.zipped.path)
            values = {
                'crc32': crc32,
                'filesize': filesize,
                'filecount': filecount,
            }
            updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
            # If the final size does not match the linked gallery, re-queue panda sources or ask for a manual check.
            if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
                if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                    self.logger.info(
                        "For archive: {} size does not match gallery, "
                        "but there's already another archive that matches.".format(updated_archive)
                    )
                    return
                if 'panda' in archive.source_type:
                    self.logger.info(
                        "For archive: {} size does not match gallery, "
                        "downloading again from panda_archive.".format(updated_archive)
                    )
                    if self.web_queue:
                        temp_settings = Settings(load_from_config=self.settings.config)
                        temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                        self.web_queue.enqueue_args_list(
                            (updated_archive.gallery.get_link(),),
                            override_options=temp_settings
                        )
                else:
                    self.logger.warning(
                        "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                    )
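
The corruption test in Example #2 (open the zip, call testzip(), and treat BadZipFile or NotImplementedError as a failure) reappears in Example #3. A standalone sketch of that check, plus a plausible chunked implementation of the calc_crc32 helper used throughout (both are illustrative assumptions, not the project's actual code):

import zlib
from zipfile import ZipFile, BadZipFile


def zip_is_healthy(zip_path: str) -> bool:
    # True when the archive opens and testzip() finds no corrupt member.
    try:
        with ZipFile(zip_path, 'r') as my_zip:
            return my_zip.testzip() is None
    except (BadZipFile, NotImplementedError):
        # NotImplementedError covers unsupported compression methods.
        return False


def calc_crc32(file_path: str, chunk_size: int = 8192) -> str:
    # CRC32 of the whole file, computed in chunks, returned as 8-digit hex.
    crc = 0
    with open(file_path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            crc = zlib.crc32(chunk, crc)
    return '{:08x}'.format(crc & 0xFFFFFFFF)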
Example #3
    def start_crawling(self, arg_line: List[str]) -> None:

        args = self.get_args(arg_line)

        if isinstance(args, ArgumentParserError):
            self.logger.info(str(args))
            return

        files = []
        do_not_replace = False
        values: DataDict = {}

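        # Each maintenance flag below either performs a one-off task and returns,
        # or builds up the files list that the matcher loop processes afterwards.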
        if args.remove_missing_files:
            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        Archive.objects.delete_by_filter(
                            pk=archive.pk)

            return
        elif args.display_missing_files:

            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
            return
        elif args.rematch_non_matches:
            self.settings.rematch_file_list = ['non-match']
            self.settings.rematch_file = True
            found_archives = Archive.objects.filter(
                match_type='non-match')
            if found_archives:
                self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)

        elif args.rematch_by_match_type:
            self.settings.rematch_file_list = [args.rematch_by_match_type]
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            found_archives = Archive.objects.filter(
                match_type=args.rematch_by_match_type)
            if found_archives:
                self.logger.info("Scanning {} archives matched by {}".format(
                    found_archives.count(), args.rematch_by_match_type
                ))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)
        elif args.rematch_wrong_filesize:
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            do_not_replace = True
            found_archives = Archive.objects.exclude(
                match_type='non-match', gallery_id__isnull=True)
            if found_archives:
                for archive in found_archives:
                    # Skip archives without a linked gallery; otherwise the
                    # filesize comparison below would raise an AttributeError.
                    if not archive.gallery:
                        continue
                    if not os.path.isfile(archive.zipped.path):
                        continue
                    if archive.filesize == archive.gallery.filesize:
                        continue
                    files.append(archive.zipped.path)
                self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
        elif args.recalc_missing_crc32:

            found_archives = Archive.objects.filter(crc32='')

            if found_archives:
                self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
                for cnt, archive in enumerate(found_archives):
                    if os.path.isfile(archive.zipped.path):
                        crc32 = calc_crc32(
                            archive.zipped.path)
                        self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                        values = {'crc32': crc32}
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                    else:
                        self.logger.info("Archive {} of {}, path: {} does not exist".format(
                            (cnt + 1),
                            found_archives.count(),
                            archive.zipped.path
                        ))
            return
        elif args.all_filenames_to_title:

            archives_title_gid = Archive.objects.exclude(
                title='')

            if archives_title_gid:
                self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
                for cnt, archive in enumerate(archives_title_gid):
                    current_path = os.path.join(os.path.dirname(
                        archive.zipped.path), replace_illegal_name(archive.title) + '.zip')

                    if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                        if args.filename_to_title == 'rename':
                            os.rename(archive.zipped.path, os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)
            return

        elif args.rematch_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")

                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': 'gallery_database',
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                        Gallery.objects.update_by_dl_type(
                            {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                    else:
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
                            values = {
                                'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                                'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                                'zipped': archive.zipped.path,
                                'crc32': archive.crc32,
                                'match_type': archive.match_type,
                                'filesize': archive.filesize,
                                'filecount': archive.filecount,
                                'gallery_id': galleries_id_token[1]
                            }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)

            return

        elif args.display_match_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                    else:
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))

            return
        else:
            for folder in args.folder:
                p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
                if not p.startswith(self.settings.MEDIA_ROOT):
                    continue
                folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")
                if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                        for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                            files.append(
                                os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
                elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    files.append(folder)

        if args.rename_to_title:
            self.logger.info("Checking {} galleries".format(len(files)))
            for cnt, filepath in enumerate(files):

                archive = Archive.objects.filter(zipped=filepath).first()

                if archive:
                    current_path = os.path.join(
                        os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')

                    if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                        if args.rename_to_title == 'rename':
                            os.rename(os.path.join(self.settings.MEDIA_ROOT, filepath), os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
            return

        if args.set_reason:
            self.settings.archive_reason = args.set_reason

        if args.set_source:
            self.settings.archive_source = args.set_source

        # The creation of the files list ends here. From here onwards, it's processing them.

        if len(files) == 0:
            self.logger.info("No file matching needed, skipping matchers")
        else:
            self.logger.info("Starting checks for {} archives".format(len(files)))

            matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
            for matcher in matchers_list:
                self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))

            for cnt, filepath in enumerate(files):

                self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))

                title = re.sub(
                    '[_]', ' ', os.path.splitext(os.path.basename(filepath))[0])
                archive = Archive.objects.filter(zipped=filepath).first()
                if not self.settings.rehash_files and archive:
                    crc32 = archive.crc32
                else:
                    crc32 = calc_crc32(
                        os.path.join(self.settings.MEDIA_ROOT, filepath))

                if archive:
                    if args.force_rematch:
                        self.logger.info("Doing a forced rematch")
                    elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                        if self.settings.rematch_file:
                            self.logger.info("File was already matched before, but rematch is ordered")
                        else:
                            self.logger.info("File was already matched before, not rematching")
                            continue
                    else:
                        self.logger.info("Match already saved, skipping")
                        continue
                else:
                    # Test for corrupt files
                    except_at_open = False
                    return_error = None
                    try:
                        my_zip = ZipFile(
                            os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                        return_error = my_zip.testzip()
                        my_zip.close()
                    except (BadZipFile, NotImplementedError):
                        except_at_open = True
                    if except_at_open or return_error:
                        self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                        values = {
                            'title': title,
                            'title_jpn': '',
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': 'corrupt',
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'source_type': 'folder'
                        }
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.update_or_create_by_values_and_gid(
                            values, None, zipped=filepath)
                        continue

                    # Look for previous matches
                    archive = Archive.objects.filter(crc32=crc32).first()
                    if archive:
                        if self.settings.copy_match_file:
                            self.logger.info("Found previous match by CRC32, copying its values")
                            values = {
                                'title': archive.title,
                                'title_jpn': archive.title_jpn,
                                'zipped': filepath,
                                'crc32': crc32,
                                'match_type': archive.match_type,
                                'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'gallery_id': archive.gallery_id,
                                'source_type': archive.source_type
                            }
                            if self.settings.archive_reason:
                                values.update({'reason': self.settings.archive_reason})
                            if self.settings.archive_details:
                                values.update({'details': self.settings.archive_details})
                            if self.settings.archive_source:
                                values.update({'source_type': self.settings.archive_source})
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
                            continue
                        else:
                            self.logger.info("Matching independently and ignoring previous match")

                match_result = False

                start_time = time.perf_counter()

                match_type = ''
                match_title = ''
                match_link = ''
                match_count = 0

                for i, matcher in enumerate(matchers_list):
                    if i > 0:
                        time.sleep(self.settings.wait_timer)
                    self.logger.info("Matching with: {}".format(matcher[0]))
                    if matcher[0].start_match(filepath, crc32):
                        match_type = matcher[0].found_by
                        match_title = matcher[0].match_title or ''
                        match_link = matcher[0].match_link or ''
                        match_count = matcher[0].match_count
                        match_result = True
                        break

                end_time = time.perf_counter()

                self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))

                if not match_result and not do_not_replace:
                    self.logger.info('Could not match with any matcher, adding as non-match.')

                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'non-match',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    archive = Archive.objects.update_or_create_by_values_and_gid(
                        values, None, zipped=filepath)

                    if self.settings.internal_matches_for_non_matches:
                        self.logger.info('Generating possible internal matches.')

                        archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                        self.logger.info('Generated matches for {}, found {}'.format(
                            archive.zipped.path,
                            archive.possible_matches.count()
                        ))
                elif match_result:
                    result_message = (
                        "Matched title: {}\n"
                        "Matched link: {}\n"
                        "Matched type: {}\n"
                        "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                    )
                    self.logger.info(result_message)

        self.logger.info('Folder crawler done.')
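
The folder-collection branch near the end of Example #3 normalizes each user-supplied folder, rejects anything that resolves outside MEDIA_ROOT, and then walks the directory with fnmatch to gather matching files. A self-contained sketch of that logic, assuming media_root is an absolute path and filename_filter is a glob such as '*.zip' (the function name and parameters here are illustrative, not the project's API):

import fnmatch
import os
from typing import List


def collect_archive_files(media_root: str, folders: List[str], filename_filter: str = '*.zip') -> List[str]:
    # Returns paths relative to media_root; folders that escape media_root
    # after normalization are skipped (a simple path traversal guard).
    files: List[str] = []
    for folder in folders:
        p = os.path.normpath(os.path.join(media_root, folder))
        if not p.startswith(media_root):
            continue
        relative = os.path.relpath(p, media_root).replace("\\", "/")
        full = os.path.join(media_root, relative)
        if os.path.isdir(full):
            for root, _, filenames in os.walk(full):
                for filename in fnmatch.filter(filenames, filename_filter):
                    files.append(os.path.relpath(os.path.join(root, filename), media_root))
        elif os.path.isfile(full):
            files.append(relative)
    return files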