def format_match_values(self) -> Optional[DataDict]: if not self.match_values: return None self.match_gid = self.match_values.gid values = { 'title': self.match_title, 'title_jpn': '', 'zipped': self.file_path, 'crc32': self.crc32, 'match_type': self.found_by, 'filesize': get_zip_filesize( os.path.join(self.settings.MEDIA_ROOT, self.file_path)), 'filecount': filecount_in_zip( os.path.join(self.settings.MEDIA_ROOT, self.file_path)), 'source_type': self.provider } return values
def process_downloaded_archive(self, archive: Archive) -> None:
    """Verify a freshly downloaded archive file and reconcile its metadata.

    Runs a zip integrity check on the file; on failure, panda-sourced
    archives are re-queued as a forced panda_archive download, others are
    logged for manual review. On success (or non-panda failure) the crc32,
    filesize and filecount are stored, and if the size disagrees with the
    matched gallery a panda re-download may be queued.

    Args:
        archive: the Archive row whose zipped file was just downloaded.
    """
    if not os.path.isfile(archive.zipped.path):
        return
    except_at_open = False
    return_error = None
    try:
        # Context manager guarantees the handle is closed even when
        # testzip() raises (e.g. NotImplementedError for an unsupported
        # compression method); the previous open/test/close sequence
        # leaked the handle in that case.
        with ZipFile(archive.zipped.path, 'r') as my_zip:
            # testzip() returns the name of the first bad file, or None.
            return_error = my_zip.testzip()
    except (BadZipFile, NotImplementedError):
        except_at_open = True
    if except_at_open or return_error:
        if 'panda' in archive.source_type:
            self.logger.error(
                "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
            )
            crc32 = calc_crc32(archive.zipped.path)
            Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
            if self.web_queue and archive.gallery:
                # Re-download restricted to the panda_archive downloader.
                temp_settings = Settings(load_from_config=self.settings.config)
                temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
            return
        else:
            self.logger.warning(
                "For archive: {}, File check on downloaded zipfile: {}. "
                "Check the file manually.".format(archive, archive.zipped.path)
            )
    # Persist fresh integrity metadata for the downloaded file.
    crc32 = calc_crc32(archive.zipped.path)
    filesize = get_zip_filesize(archive.zipped.path)
    filecount = filecount_in_zip(archive.zipped.path)
    values = {'crc32': crc32, 'filesize': filesize, 'filecount': filecount, }
    updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
    if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
        # Another archive already covering this gallery at the right size
        # means nothing further to do.
        if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
            self.logger.info(
                "For archive: {} size does not match gallery, "
                "but there's already another archive that matches.".format(updated_archive)
            )
            return
        if 'panda' in archive.source_type:
            self.logger.info(
                "For archive: {} size does not match gallery, "
                "downloading again from panda_archive.".format(updated_archive)
            )
            if self.web_queue:
                temp_settings = Settings(load_from_config=self.settings.config)
                temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                self.web_queue.enqueue_args_list(
                    (updated_archive.gallery.get_link(), ),
                    override_options=temp_settings
                )
        else:
            self.logger.warning(
                "For archive: {} size does not match gallery. Check the file manually.".format(archive)
            )
def start_crawling(self, arg_line: List[str]) -> None:
    """Entry point for the folder crawler.

    Parses arg_line and either runs one of the maintenance subcommands
    (remove/display missing files, the rematch variants, CRC32 recalc,
    filename/title reconciliation) or builds a list of zip files under
    MEDIA_ROOT and runs every configured matcher over them, persisting the
    results as Archive rows.

    Args:
        arg_line: raw command-line style argument list for get_args.
    """
    args = self.get_args(arg_line)
    if isinstance(args, ArgumentParserError):
        # Parsing failed; the error object stringifies to the message.
        self.logger.info(str(args))
        return
    files = []
    do_not_replace = False
    values: DataDict = {}
    if args.remove_missing_files:
        # Delete Archive rows whose backing zip no longer exists on disk.
        found_archives = Archive.objects.all()
        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    Archive.objects.delete_by_filter(
                        pk=archive.pk)
        return
    elif args.display_missing_files:
        # Dry-run variant of remove_missing_files: report only, never delete.
        found_archives = Archive.objects.all()
        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
        return
    elif args.rematch_non_matches:
        # Queue every 'non-match' archive still present on disk for rematching.
        self.settings.rematch_file_list = ['non-match']
        self.settings.rematch_file = True
        found_archives = Archive.objects.filter(
            match_type='non-match')
        if found_archives:
            self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_by_match_type:
        # Rematch every archive previously matched by the given match_type.
        self.settings.rematch_file_list = [args.rematch_by_match_type]
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        found_archives = Archive.objects.filter(
            match_type=args.rematch_by_match_type)
        if found_archives:
            self.logger.info("Scanning {} archives matched by {}".format(
                found_archives.count(), args.rematch_by_match_type
            ))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_wrong_filesize:
        # Re-queue matched archives whose stored filesize disagrees with
        # their gallery's filesize. do_not_replace prevents the processing
        # loop below from overwriting rows when no matcher succeeds.
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        do_not_replace = True
        # NOTE(review): Django's exclude(A, B) only excludes rows matching
        # BOTH conditions, so an archive with a null gallery but a
        # non-'non-match' type can still reach archive.gallery.filesize
        # below and raise — confirm this queryset is intended.
        found_archives = Archive.objects.exclude(
            match_type='non-match', gallery_id__isnull=True)
        if found_archives:
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    continue
                if archive.filesize == archive.gallery.filesize:
                    continue
                files.append(archive.zipped.path)
            self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
    elif args.recalc_missing_crc32:
        # Backfill CRC32 for rows that were stored without one.
        found_archives = Archive.objects.filter(crc32='')
        if found_archives:
            self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
            for cnt, archive in enumerate(found_archives):
                if os.path.isfile(archive.zipped.path):
                    crc32 = calc_crc32(
                        archive.zipped.path)
                    self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                    values = {'crc32': crc32}
                    Archive.objects.add_or_update_from_values(
                        values, pk=archive.pk)
                else:
                    self.logger.info("Archive {} of {}, path: {} does not exist".format(
                        (cnt + 1), found_archives.count(), archive.zipped.path
                    ))
        return
    elif args.all_filenames_to_title:
        # Report (and optionally rename) archives whose on-disk filename
        # does not match their stored title.
        archives_title_gid = Archive.objects.exclude(
            title='')
        if archives_title_gid:
            self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
            for cnt, archive in enumerate(archives_title_gid):
                current_path = os.path.join(os.path.dirname(
                    archive.zipped.path), replace_illegal_name(archive.title) + '.zip')
                # Only act when the target name is free, to avoid clobbering.
                if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                    self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                    # NOTE(review): this branch is args.all_filenames_to_title
                    # but the rename gate reads args.filename_to_title —
                    # confirm the two flags are meant to combine like this.
                    if args.filename_to_title == 'rename':
                        os.rename(archive.zipped.path, os.path.join(
                            self.settings.MEDIA_ROOT, current_path))
                        values = {'zipped': current_path, }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
        return
    elif args.rematch_from_internal_gallery_titles:
        # Fuzzy-match non-matched archives against titles already in the
        # local gallery/archive database (no network matchers involved).
        non_matched_archives = Archive.objects.filter(
            match_type='non-match')
        if non_matched_archives:
            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()
            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(
                    os.path.basename(archive.zipped.path)).replace(".zip", "")
                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    # Gallery title hit: adopt that gallery's metadata.
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                    values = {
                        'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                        'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                        'zipped': archive.zipped.path,
                        'crc32': archive.crc32,
                        'match_type': 'gallery_database',
                        'filesize': archive.filesize,
                        'filecount': archive.filecount,
                        'gallery_id': galleries_id_token[1]
                    }
                    Archive.objects.add_or_update_from_values(
                        values, pk=archive.pk)
                    Gallery.objects.update_by_dl_type(
                        {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                else:
                    # Fall back to titles of already-matched archives.
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': archive.match_type,
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
        return
    elif args.display_match_from_internal_gallery_titles:
        # Dry-run of rematch_from_internal_gallery_titles: log candidate
        # matches without writing anything.
        non_matched_archives = Archive.objects.filter(
            match_type='non-match')
        if non_matched_archives:
            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()
            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(
                    os.path.basename(archive.zipped.path)).replace(".zip", "")
                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                else:
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
        return
    else:
        # Default mode: collect zip files (relative to MEDIA_ROOT) from the
        # requested folders, rejecting paths that escape MEDIA_ROOT.
        for folder in args.folder:
            p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
            if not p.startswith(self.settings.MEDIA_ROOT):
                continue
            folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")
            if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                    for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                        files.append(
                            os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
            elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                files.append(folder)
        if args.rename_to_title:
            # Rename collected files to their Archive title (when known and
            # the target filename is not already taken).
            self.logger.info("Checking {} galleries".format(len(files)))
            for cnt, filepath in enumerate(files):
                archive = Archive.objects.filter(zipped=filepath).first()
                if archive:
                    current_path = os.path.join(
                        os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')
                    if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                        if args.rename_to_title == 'rename':
                            os.rename(os.path.join(self.settings.MEDIA_ROOT, filepath), os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path, }
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
            return
    if args.set_reason:
        self.settings.archive_reason = args.set_reason
    if args.set_source:
        self.settings.archive_source = args.set_source
    # The creation of the files list ends here. From here onwards, it's processing them.
    if len(files) == 0:
        self.logger.info("No file matching needed, skipping matchers")
    else:
        self.logger.info("Starting checks for {} archives".format(len(files)))
        matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
        for matcher in matchers_list:
            self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))
        for cnt, filepath in enumerate(files):
            self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))
            # Derive a display title from the filename (underscores → spaces).
            title = re.sub(
                '[_]', ' ', os.path.splitext(os.path.basename(filepath))[0])
            archive = Archive.objects.filter(zipped=filepath).first()
            # Reuse the stored CRC32 unless a rehash was requested.
            if not self.settings.rehash_files and archive:
                crc32 = archive.crc32
            else:
                crc32 = calc_crc32(
                    os.path.join(self.settings.MEDIA_ROOT, filepath))
            if archive:
                # Decide whether an already-known file should be rematched.
                if args.force_rematch:
                    self.logger.info("Doing a forced rematch")
                elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                    if self.settings.rematch_file:
                        self.logger.info("File was already matched before, but rematch is ordered")
                    else:
                        self.logger.info("File was already matched before, not rematching")
                        continue
                else:
                    self.logger.info("Match already saved, skipping")
                    continue
            else:
                # Test for corrupt files
                except_at_open = False
                return_error = None
                try:
                    my_zip = ZipFile(
                        os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                    return_error = my_zip.testzip()
                    my_zip.close()
                except (BadZipFile, NotImplementedError):
                    except_at_open = True
                if except_at_open or return_error:
                    # Record the file as 'corrupt' and skip matching.
                    self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'corrupt',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    Archive.objects.update_or_create_by_values_and_gid(
                        values, None, zipped=filepath)
                    continue
                # Look for previous matches
                archive = Archive.objects.filter(crc32=crc32).first()
                if archive:
                    if self.settings.copy_match_file:
                        # Same content seen before: clone that row's metadata.
                        self.logger.info("Found previous match by CRC32, copying its values")
                        values = {
                            'title': archive.title,
                            'title_jpn': archive.title_jpn,
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': archive.match_type,
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'gallery_id': archive.gallery_id,
                            'source_type': archive.source_type
                        }
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.add_or_update_from_values(
                            values, zipped=filepath)
                        continue
                    else:
                        self.logger.info("Matching independently and ignoring previous match")
            # Run the matchers in priority order, stopping at the first hit;
            # wait_timer throttles successive (typically remote) matchers.
            match_result = False
            start_time = time.perf_counter()
            match_type = ''
            match_title = ''
            match_link = ''
            match_count = 0
            for i, matcher in enumerate(matchers_list):
                if i > 0:
                    time.sleep(self.settings.wait_timer)
                self.logger.info("Matching with: {}".format(matcher[0]))
                if matcher[0].start_match(filepath, crc32):
                    match_type = matcher[0].found_by
                    match_title = matcher[0].match_title or ''
                    match_link = matcher[0].match_link or ''
                    match_count = matcher[0].match_count
                    match_result = True
                    break
            end_time = time.perf_counter()
            self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))
            if not match_result and not do_not_replace:
                # Persist as 'non-match' (unless do_not_replace protects the
                # existing row) and optionally suggest internal candidates.
                self.logger.info('Could not match with any matcher, adding as non-match.')
                values = {
                    'title': title,
                    'title_jpn': '',
                    'zipped': filepath,
                    'crc32': crc32,
                    'match_type': 'non-match',
                    'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'source_type': 'folder'
                }
                if self.settings.archive_reason:
                    values.update({'reason': self.settings.archive_reason})
                if self.settings.archive_details:
                    values.update({'details': self.settings.archive_details})
                if self.settings.archive_source:
                    values.update({'source_type': self.settings.archive_source})
                archive = Archive.objects.update_or_create_by_values_and_gid(
                    values, None, zipped=filepath)
                if self.settings.internal_matches_for_non_matches:
                    self.logger.info('Generating possible internal matches.')
                    archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                    self.logger.info('Generated matches for {}, found {}'.format(
                        archive.zipped.path, archive.possible_matches.count()
                    ))
            elif match_result:
                result_message = (
                    "Matched title: {}\n"
                    "Matched link: {}\n"
                    "Matched type: {}\n"
                    "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                )
                self.logger.info(result_message)
    self.logger.info('Folder crawler done.')