Ejemplo n.º 1
0
 def get_archive_and_gallery_titles() -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
     """Collect (sanitized title, gallery id) pairs for archives and galleries.

     Returns:
         A 2-tuple: first the pairs built from archives that already have a
         match type set (excluding empty and 'non-match'), then the pairs
         from eligible galleries whose tag list does not contain 'replaced'.
     """
     found_galleries = Gallery.objects.eligible_for_use()
     # Archives with an empty or 'non-match' match_type have no usable gallery link.
     found_archives = Archive.objects.exclude(
         match_type__in=('', 'non-match'))
     archives_title_gid = [
         (replace_illegal_name(archive.title), archive.gallery_id)
         for archive in found_archives
     ]
     galleries_title_gid = [
         (replace_illegal_name(gallery.title), gallery.id)
         for gallery in found_galleries
         if 'replaced' not in gallery.tag_list()
     ]
     return archives_title_gid, galleries_title_gid
Ejemplo n.º 2
0
    def start_download(self) -> None:
        """Request a Hath (H@H) archive download for the current gallery.

        Sends the archiver request, verifies the confirmation page, and
        reserves a local filename. Sets self.return_code to 1 on success
        (download queued on a client) or 0 on any failure path.
        """
        if not self.gallery:
            return

        # All four fields are required to build the Hath download request.
        if not (self.gallery.root and self.gallery.gid and self.gallery.token
                and self.gallery.archiver_key):
            logger.error(
                'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'
                .format(
                    self.gallery.root,
                    self.gallery.gid,
                    self.gallery.token,
                    self.gallery.archiver_key,
                ))
            self.return_code = 0
            return

        r = self.request_hath_download(self.gallery.root, self.gallery.gid,
                                       self.gallery.token,
                                       self.gallery.archiver_key)

        if r and r.status_code == 200:

            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.content, 'html.parser')

            # The confirmation page contains this sentence when the download
            # was successfully queued on a H@H client.
            container = soup.find(text=re.compile(
                'An original resolution download has been queued for client'))

            if not container:
                logger.error("Could not find expected text in response.")
                self.return_code = 0
                return
            # The client name sits in a <strong> tag next to the confirmation text.
            client_id = container.parent.find('strong')
            if client_id:
                logger.info("Queued download to client: {}".format(
                    client_id.get_text()))

            to_use_filename = get_base_filename_string_from_gallery_data(
                self.gallery)

            # Reserve a non-conflicting local name: "<title> [<gid>].zip".
            self.gallery.filename = available_filename(
                self.settings.MEDIA_ROOT,
                os.path.join(
                    self.own_settings.hath_dl_folder,
                    replace_illegal_name(to_use_filename + " [" +
                                         str(self.gallery.gid) + "]") +
                    '.zip'))

            self.fileDownloaded = 1
            self.return_code = 1
        else:
            if r:
                logger.error('Did not get a 200 response, text: {}'.format(
                    r.text))
            else:
                logger.error('Did not get a response')
            self.return_code = 0
Ejemplo n.º 3
0
    def connect_and_download(self, client: TorrentClient,
                             torrent_link: str) -> None:
        """Add torrent_link to the torrent client and record the expected local file.

        On success sets fileDownloaded/return_code to 1 and fills in the
        expected torrent name, extension, gallery filesize and filename;
        on failure sets return_code to 0.
        """
        if not self.gallery:
            return None
        client.connect()
        download_dir = self.settings.torrent['download_dir']
        if client.send_url:
            added_ok = client.add_url(torrent_link, download_dir=download_dir)
        else:
            torrent_payload = self.general_utils.get_torrent(
                torrent_link,
                self.own_settings.cookies,
                convert_to_base64=client.convert_to_base64)
            added_ok = client.add_torrent(torrent_payload,
                                          download_dir=download_dir)
        if not added_ok:
            self.return_code = 0
            logger.error("There was an error adding the torrent to the client")
            return

        # Prefer the name reported by the client; otherwise derive it from
        # the gallery data.
        if client.expected_torrent_name:
            base_name = client.expected_torrent_name
        else:
            base_name = replace_illegal_name(
                get_base_filename_string_from_gallery_data(self.gallery))
        self.expected_torrent_name = "{} [{}]".format(base_name,
                                                      self.gallery.gid)

        # Default to .zip when the client does not report an extension.
        self.expected_torrent_extension = (client.expected_torrent_extension
                                           or ".zip")

        self.fileDownloaded = 1
        self.return_code = 1
        if client.total_size > 0:
            self.gallery.filesize = client.total_size
        self.gallery.filename = os.path.join(
            self.own_settings.torrent_dl_folder,
            replace_illegal_name(self.expected_torrent_name) +
            self.expected_torrent_extension)
        logger.info(
            "Torrent added, expecting downloaded name: {}, local name: {}".
            format(self.expected_torrent_name, self.gallery.filename))
Ejemplo n.º 4
0
    def start_download(self) -> None:
        """Download the gallery archive from a generic HTTP server.

        Derives a filename from the Content-Disposition header or the URL,
        streams the body to disk and reads zip metadata. Sets return_code
        to 1 on success, 0 on failure.
        """
        if not self.gallery or not self.gallery.link:
            return

        logger.info(
            "Downloading an archive from a generic HTTP server: {}".format(
                self.gallery.link))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        # stream=True (boolean, not the string 'True') so the body is fetched
        # in chunks below instead of loaded into memory at once.
        request_file = requests.get(self.gallery.link,
                                    stream=True,
                                    **request_dict)

        filename = get_filename_from_cd(
            request_file.headers.get('content-disposition'))

        if not filename:
            # Fall back to the last path segment of the URL. The original
            # used link.find('/'), which is -1 (truthy) when absent and
            # 0 (falsy) for a leading slash — both wrong.
            if '/' in self.gallery.link:
                filename = self.gallery.link.rsplit('/', 1)[1]

        if not filename:
            logger.error("Could not find a filename for link: {}".format(
                self.gallery.link))
            self.return_code = 0
            # Bail out: the code below would crash calling .replace on None.
            return

        self.gallery.title = filename.replace(".zip", "")
        self.gallery.filename = replace_illegal_name(
            available_filename(
                self.settings.MEDIA_ROOT,
                os.path.join(self.own_settings.archive_dl_folder, filename)))

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
Ejemplo n.º 5
0
    def start_download(self) -> None:
        """Fetch the gallery's temporary archive from a Panda Backup-like source.

        Streams the file to a reserved local path and records zip metadata.
        Sets return_code to 1 on success, 0 on failure.
        """
        if not self.gallery or not self.gallery.temp_archive:
            return

        archive_link = self.gallery.temp_archive['link']
        logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, archive_link))

        base_name = replace_illegal_name(
            get_base_filename_string_from_gallery_data(self.gallery))

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         base_name + '.zip'))  # TODO: File could be cbz.

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['stream'] = True
        response = request_with_retries(archive_link, request_dict)
        if not response:
            logger.error("Could not download archive")
            self.return_code = 0
            return

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        with open(filepath, 'wb') as target:
            for piece in response.iter_content(4096):
                target.write(piece)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)
            self.fileDownloaded = 1
            self.return_code = 1
        else:
            logger.error("Could not download archive")
            self.return_code = 0
Ejemplo n.º 6
0
    def start_download(self) -> None:
        """Download the gallery archive from its archiver_key URL.

        On success sets fileDownloaded/return_code to 1; on a bad or empty
        zip removes the partial file and sets return_code to 0.
        """
        if not self.gallery or not self.gallery.link or not self.gallery.archiver_key:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        # Reserve a non-conflicting local target path.
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        # stream=True (boolean, not the string 'True') so the response body
        # is consumed in chunks below.
        request_file = requests.get(self.gallery.archiver_key,
                                    stream=True,
                                    **request_dict)

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            # Clean up the invalid/partial download.
            os.remove(filepath)
            self.return_code = 0
Ejemplo n.º 7
0
    def start_download(self) -> None:
        """Download the gallery archive from a Panda Backup-like source.

        Streams the archiver link to a reserved local path and reads zip
        metadata. Sets return_code to 1 on success, 0 on failure.
        """
        if not self.gallery:
            return

        self.logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, self.gallery.archiver_key['link']))

        self.gallery.title = replace_illegal_name(self.gallery.title)
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         self.gallery.title + '.zip'))

        # stream=True (boolean, not the string 'True') so the response body
        # is consumed in chunks below.
        request_file = requests.get(self.gallery.archiver_key['link'],
                                    stream=True,
                                    headers=self.settings.requests_headers,
                                    timeout=self.settings.timeout_timer,
                                    cookies=self.own_settings.cookies)

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            self.logger.error("Could not download archive")
            self.return_code = 0
Ejemplo n.º 8
0
    def start_download(self) -> None:
        """Download the gallery via the external gallery-dl executable.

        Runs gallery-dl into a temporary directory, then moves the last file
        found there into the archive download folder and records zip
        metadata. Sets return_code to 1 on success, 0 on any failure.
        """
        if not self.gallery or not self.gallery.link:
            return

        # Resolve the executable: an explicit configured path wins over a
        # plain name lookup on PATH.
        if self.settings.gallery_dl.executable_path:
            exe_path_to_use = shutil.which(
                self.settings.gallery_dl.executable_path)
        else:
            exe_path_to_use = shutil.which(
                self.settings.gallery_dl.executable_name)

        if not exe_path_to_use:
            self.return_code = 0
            logger.error("The gallery-dl executable was not found")
            return

        directory_path = mkdtemp()

        arguments = ["--zip", "--dest", "{}".format(directory_path)]

        if self.own_settings.proxy:
            arguments.append("--proxy")
            arguments.append("{}".format(self.own_settings.proxy))

        if self.settings.gallery_dl.config_file:
            arguments.append("--config")
            arguments.append("{}".format(self.settings.gallery_dl.config_file))

        if self.settings.gallery_dl.extra_arguments:
            arguments.append("{}".format(
                self.settings.gallery_dl.extra_arguments))

        arguments.append("{}".format(self.gallery.link))

        logger.info("Calling gallery-dl: {}.".format(" ".join(
            [exe_path_to_use, *arguments])))

        process_result = subprocess.run([exe_path_to_use, *arguments],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        universal_newlines=True)

        # Any stderr output is treated as a failure, even with exit code 0.
        if process_result.stderr:
            self.return_code = 0
            logger.error(
                "An error was captured when running gallery-dl: {}".format(
                    process_result.stderr))
            return

        if process_result.returncode != 0:
            self.return_code = 0
            logger.error("Return code was not 0: {}".format(
                process_result.returncode))
            return

        # If we downloaded more than one file, get the latest one
        output_path = ''
        file_name = ''
        for (dir_path, dir_names, filenames) in os.walk(directory_path):
            for current_file in filenames:
                file_name = current_file
                output_path = os.path.join(dir_path, current_file)

        # Single merged guard (the original tested `not output_path` twice
        # in two consecutive, partially redundant if-blocks).
        if not output_path or not os.path.isfile(output_path):
            self.return_code = 0
            logger.error(
                "The resulting download file was not found: {}".format(
                    file_name))
            return

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         replace_illegal_name(file_name)))

        self.gallery.title = os.path.splitext(file_name)[0]

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        # Move the download into place and drop the temporary directory.
        shutil.move(output_path, filepath)
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
Ejemplo n.º 9
0
def match_internal(archives: ArchiveQuerySet,
                   providers: Iterable[str],
                   logger: OptionalLogger,
                   cutoff: float = 0.4,
                   max_matches: int = 20,
                   match_by_filesize: bool = True) -> None:
    """Match archives against the internal gallery database.

    For each archive, fuzzy-compares its adjusted filename title against
    the titles (main and Japanese) of eligible galleries, per provider when
    `providers` is given or in one 'all' pool otherwise, and records the
    closest hits as ArchiveMatches rows. Optionally also records galleries
    with an identical filesize.

    Args:
        archives: queryset of archives to match.
        providers: provider names to build per-provider gallery pools from;
            empty means a single combined pool.
        logger: optional logger for progress messages.
        cutoff: minimum similarity ratio for a title match.
        max_matches: cap on title matches kept per archive/provider.
        match_by_filesize: also record exact-filesize matches when True.
    """

    galleries_per_provider: Dict[str, GalleryQuerySet] = {}
    galleries_title_id_per_provider: Dict[str, List[Tuple[str, str]]] = {}

    if providers:
        for provider in providers:
            galleries_per_provider[
                provider] = Gallery.objects.eligible_for_use(
                    provider__contains=provider)
    else:
        galleries_per_provider['all'] = Gallery.objects.eligible_for_use()

    for provider, galleries in galleries_per_provider.items():
        galleries_title_id_per_provider[provider] = []
        for gallery in galleries:
            # Index both the main and the Japanese title when present.
            if gallery.title:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title), gallery.pk))
            if gallery.title_jpn:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title_jpn), gallery.pk))

    # Hoisted loop-invariant work: the archive count was a COUNT query per
    # iteration, and the matcher lookup depends only on the provider, so
    # neither needs to be re-evaluated for every archive.
    total_archives = archives.count()
    matchers_per_provider = {}
    for provider in galleries_title_id_per_provider:
        if provider != 'all':
            matchers_per_provider[
                provider] = crawler_settings.provider_context.get_matchers(
                    crawler_settings,
                    logger,
                    filter_name="{}_title".format(provider),
                    force=True)

    for i, archive in enumerate(archives, start=1):  # type: ignore

        for provider, galleries_title_id in galleries_title_id_per_provider.items(
        ):

            # For the 'all' pool there is no matcher; fall back to the
            # generic path-derived title.
            matchers = matchers_per_provider.get(provider)
            if matchers:
                adj_title = matchers[0][0].format_to_compare_title(
                    archive.zipped.name)
            else:
                adj_title = get_title_from_path(archive.zipped.name)
            similar_list_provider = get_list_closer_gallery_titles_from_list(
                adj_title, galleries_title_id, cutoff, max_matches)

            if similar_list_provider is not None:

                for similar in similar_list_provider:
                    gallery = Gallery.objects.get(pk=similar[1])

                    ArchiveMatches.objects.update_or_create(
                        archive=archive,
                        gallery=gallery,
                        match_type='title',
                        match_accuracy=similar[2])

                if logger:
                    logger.info(
                        "{} of {}: Found {} matches (internal search) from title for archive: {}, using provider filter: {}"
                        .format(i, total_archives,
                                len(similar_list_provider), archive.title,
                                provider))

        if not match_by_filesize or archive.filesize <= 0:
            continue
        galleries_same_size = Gallery.objects.filter(filesize=archive.filesize)
        if galleries_same_size.exists():

            if logger:
                logger.info(
                    "{} of {}: Found {} matches (internal search) from filesize for archive: {}"
                    .format(i, str(total_archives),
                            str(galleries_same_size.count()), archive.title))
            for similar_gallery in galleries_same_size:
                gallery = Gallery.objects.get(pk=similar_gallery.pk)

                ArchiveMatches.objects.update_or_create(archive=archive,
                                                        gallery=gallery,
                                                        match_type='size',
                                                        match_accuracy=1)
Ejemplo n.º 10
0
def match_archives_from_gallery_titles(archives: ArchiveQuerySet,
                                       logger: OptionalLogger = None,
                                       cutoff: float = 0.4,
                                       max_matches: int = 20,
                                       provider: str = '') -> None:
    """Match archives (default: current non-matches) against gallery titles.

    Fuzzy-matches each archive's adjusted filename title against the
    sanitized titles of eligible galleries (optionally restricted to one
    provider) and rewrites its possible_matches; also records galleries
    with an identical filesize.

    Args:
        archives: archives to match; falsy means all 'non-match' archives.
        logger: optional logger for progress messages.
        cutoff: minimum similarity ratio for a title match.
        max_matches: cap on title matches kept per archive.
        provider: restrict the gallery pool and matcher to this provider.
    """

    try:
        if not archives:
            non_match_archives = Archive.objects.filter(match_type='non-match')
        else:
            non_match_archives = archives

        if non_match_archives:

            galleries_title_id = []

            if provider:
                galleries = Gallery.objects.eligible_for_use(
                    provider__contains=provider)
            else:
                galleries = Gallery.objects.eligible_for_use()
            for gallery in galleries:
                # Index both the main and the Japanese title when present.
                if gallery.title:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title), gallery.pk))
                if gallery.title_jpn:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title_jpn), gallery.pk))

            # Hoisted: the count was a COUNT query per log line, and the
            # matcher lookup depends only on `provider`, which is constant
            # for the whole call — no need to refetch per archive.
            total_archives = non_match_archives.count()
            matchers = crawler_settings.provider_context.get_matchers(
                crawler_settings,
                logger,
                filter_name="{}_title".format(provider),
                force=True)

            if logger:
                logger.info("Trying to match against gallery database, "
                            "{} archives with no match, matching against: {}, "
                            "number of galleries: {}, cutoff: {}".format(
                                total_archives, provider,
                                galleries.count(), cutoff))
            for i, archive in enumerate(non_match_archives, start=1):

                if matchers:
                    adj_title = matchers[0][0].format_to_compare_title(
                        archive.zipped.name)
                else:
                    adj_title = get_title_from_path(archive.zipped.name)
                similar_list = get_list_closer_gallery_titles_from_list(
                    adj_title, galleries_title_id, cutoff, max_matches)

                if similar_list is not None:

                    # Replace, not extend, the previous candidate set.
                    archive.possible_matches.clear()

                    if logger:
                        logger.info(
                            "{} of {}: Found {} matches from title for {}".
                            format(i, total_archives,
                                   len(similar_list), archive.zipped.name))
                    for similar in similar_list:
                        gallery = Gallery.objects.get(pk=similar[1])

                        ArchiveMatches.objects.create(
                            archive=archive,
                            gallery=gallery,
                            match_type='title',
                            match_accuracy=similar[2])

                if archive.filesize <= 0:
                    continue
                galleries_same_size = Gallery.objects.filter(
                    filesize=archive.filesize)
                if galleries_same_size.exists():

                    if logger:
                        logger.info(
                            "{} of {}: Found {} matches from filesize for {}".
                            format(i, str(total_archives),
                                   str(galleries_same_size.count()),
                                   archive.zipped.name))
                    for similar_gallery in galleries_same_size:
                        gallery = Gallery.objects.get(pk=similar_gallery.pk)

                        ArchiveMatches.objects.create(archive=archive,
                                                      gallery=gallery,
                                                      match_type='size',
                                                      match_accuracy=1)

        if logger:
            logger.info("Matching ended")
        return
    except Exception:
        # Narrowed from BaseException so KeyboardInterrupt/SystemExit still
        # propagate. Unexpected errors are logged (not re-raised) because
        # this may run in a background thread.
        thread_logger = logging.getLogger('viewer.threads')
        thread_logger.error(traceback.format_exc())
Ejemplo n.º 11
0
    def start_download(self) -> None:
        """Download the gallery via a torrent client.

        Adds the torrent (by URL/magnet or raw data) to the configured
        client, derives the expected downloaded name — decoding the torrent
        metadata as a fallback — and records the expected local filename.
        Sets return_code to 1 on success, 0 on failure.
        """
        if not self.gallery or not self.gallery.link:
            return

        client = get_torrent_client(self.settings.torrent)
        if not client:
            self.return_code = 0
            logger.error("No torrent client was found")
            return

        torrent_link = self.get_download_link(self.gallery.link)

        logger.info("Adding torrent to client.")
        client.connect()
        if client.send_url or torrent_link.startswith('magnet:'):
            # The client can fetch the link itself (or it's a magnet link).
            result = client.add_url(
                torrent_link,
                download_dir=self.settings.torrent['download_dir'])
        else:
            # Fetch the .torrent file ourselves and hand its data over.
            torrent_data = self.general_utils.get_torrent(
                torrent_link,
                self.own_settings.cookies,
                convert_to_base64=client.convert_to_base64)

            result = client.add_torrent(
                torrent_data,
                download_dir=self.settings.torrent['download_dir'])
            if client.expected_torrent_name == '':
                # Client didn't report a name: decode the bencoded metadata
                # to recover the torrent's internal file name and extension.
                from core.libs.bencoding import Decoder
                try:
                    if client.convert_to_base64 and type(torrent_data) is str:
                        torrent_data = cast(str, torrent_data)
                        torrent_metadata = Decoder(
                            base64.decodebytes(
                                torrent_data.encode('utf-8'))).decode()
                    else:
                        torrent_data = cast(bytes, torrent_data)
                        torrent_metadata = Decoder(torrent_data).decode()
                    client.expected_torrent_name = os.path.splitext(
                        torrent_metadata[b'info'][b'name'])[0]
                    client.expected_torrent_extension = os.path.splitext(
                        torrent_metadata[b'info'][b'name'])[1]
                except (RuntimeError, EOFError):
                    self.return_code = 0
                    logger.error("Error decoding torrent data: {!r}".format(
                        torrent_data))
                    return

        if result:
            if client.expected_torrent_name:
                self.expected_torrent_name = client.expected_torrent_name
            else:
                # Last resort: derive a name from the gallery link itself.
                self.expected_torrent_name = "{}".format(
                    replace_illegal_name(self.gallery.link))
            if client.expected_torrent_extension:
                self.expected_torrent_extension = client.expected_torrent_extension
            else:
                self.expected_torrent_extension = ".zip"
            self.fileDownloaded = 1
            self.return_code = 1
            if client.total_size > 0:
                self.gallery.filesize = client.total_size
            else:
                self.gallery.filesize = 0
            self.gallery.filename = available_filename(
                self.settings.MEDIA_ROOT,
                os.path.join(
                    self.own_settings.torrent_dl_folder,
                    replace_illegal_name(self.expected_torrent_name) +
                    self.expected_torrent_extension))
        else:
            self.return_code = 0
            logger.error("There was an error adding the torrent to the client")
Ejemplo n.º 12
0
    def copy_all_missing(self, mode, archives: Iterable[Archive] = None):
        """Recover missing archive files from local hath/torrent download folders.

        For archives whose zip is missing on disk, locates the matching hath
        image folder or torrent download, zips folders when needed, moves or
        copies the result into place (mode == 'local_move' moves, anything
        else copies), and runs post-download processing on each archive.
        """
        files_torrent = []
        files_hath = []

        if not archives:
            found_archives: Iterable[Archive] = list(Archive.objects.filter_by_dl_remote())
        else:
            found_archives = archives

        if not found_archives:
            return

        # Split missing files by how they were originally downloaded.
        for archive in found_archives:
            if not os.path.isfile(archive.zipped.path):
                if 'torrent' in archive.match_type:
                    files_torrent.append(archive)
                elif 'hath' in archive.match_type:
                    files_hath.append(archive)

        if len(files_torrent) + len(files_hath) == 0:
            return

        # Hath downloads
        if len(files_hath) > 0:
            files_matched_hath = []
            # Hath downloads are directories named "... [<gid>]"; match the
            # trailing gid against each missing archive's gallery.
            for matched_file in os.listdir(self.settings.providers['panda'].local_hath_folder):
                if os.path.isfile(os.path.join(self.settings.providers['panda'].local_hath_folder, matched_file)):
                    continue
                m = re.search(r'.*?\[(\d+)\]$', matched_file)
                if m:
                    for archive in files_hath:
                        if m.group(1) == archive.gallery.gid:
                            # [folder name, target zip path, expected size, archive]
                            files_matched_hath.append(
                                [matched_file, archive.zipped.path, int(archive.filesize), archive])

            for img_dir in files_matched_hath:
                total_remote_size = 0
                remote_files = []
                directory = os.path.join(self.settings.providers['panda'].local_hath_folder, img_dir[0])
                # Sum image sizes (galleryinfo.txt is metadata, not content).
                for img_file in os.listdir(directory):
                    if not os.path.isfile(os.path.join(directory, img_file)) or img_file == 'galleryinfo.txt':
                        continue
                    total_remote_size += os.stat(
                        os.path.join(directory, img_file)).st_size
                    remote_files.append(
                        os.path.join(directory, img_file))
                # Size mismatch means the hath download hasn't finished yet.
                if total_remote_size != img_dir[2]:
                    self.logger.info(
                        "For archive: {archive}, folder: {folder} "
                        "has not completed the download ({current}/{total}), skipping".format(
                            archive=img_dir[3],
                            folder=img_dir[0],
                            current=filesizeformat(total_remote_size),
                            total=filesizeformat(img_dir[2])
                        )
                    )
                    continue
                # NOTE(review): template says "(unknown)" but a filename kwarg
                # is passed and never used — message likely meant {filename}.
                self.logger.info(
                    "For archive: {archive}, creating zip "
                    "for folder (unknown), {image_count} images".format(
                        archive=img_dir[3],
                        filename=img_dir[1],
                        image_count=len(remote_files)
                    ))
                # Stage images in a temp dir, then zip them flat (basename only).
                dir_path = mkdtemp()
                for img_file_original in remote_files:
                    img_file = os.path.split(img_file_original)[1]
                    if mode == 'local_move':
                        shutil.move(img_file_original, os.path.join(dir_path, img_file))
                    else:
                        shutil.copy(img_file_original, os.path.join(dir_path, img_file))
                with ZipFile(os.path.join(self.settings.MEDIA_ROOT,
                                          img_dir[1]),
                             'w') as archive_file:
                    for (root_path, _, file_names) in os.walk(dir_path):
                        for current_file in file_names:
                            archive_file.write(
                                os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
                shutil.rmtree(dir_path, ignore_errors=True)

                self.process_downloaded_archive(img_dir[3])

        # Torrent downloads
        if len(files_torrent) > 0:
            files_matched_torrent = []
            # Match download entries to archives by comparing the sanitized
            # entry name against the archive's zip name (gid suffix stripped).
            for filename in os.listdir(self.settings.torrent['download_dir']):
                for archive in files_torrent:
                    if archive.gallery:
                        cleaned_torrent_name = os.path.splitext(
                            os.path.basename(archive.zipped.path))[0].replace(' [' + archive.gallery.gid + ']', '')
                    else:
                        cleaned_torrent_name = os.path.splitext(os.path.basename(archive.zipped.path))[0]
                    if replace_illegal_name(os.path.splitext(filename)[0]) in cleaned_torrent_name:
                        # [entry name, True when entry is a directory, archive]
                        files_matched_torrent.append([filename, not os.path.isfile(
                            os.path.join(self.settings.torrent['download_dir'], filename)), archive])

            for matched_file in files_matched_torrent:
                target = os.path.join(self.settings.torrent['download_dir'], matched_file[0])
                if matched_file[1]:
                    # Directory download: zip its files into the archive path.
                    # NOTE(review): "(unknown)" with an unused filename kwarg,
                    # same as the hath branch above.
                    self.logger.info(
                        "For archive: {archive}, creating zip for folder: (unknown)".format(
                            archive=matched_file[2],
                            filename=matched_file[0],
                        ))
                    dir_path = mkdtemp()
                    for img_file in os.listdir(target):
                        if not os.path.isfile(os.path.join(target, img_file)):
                            continue
                        if mode == 'local_move':
                            shutil.move(os.path.join(target, img_file), os.path.join(dir_path, img_file))
                        else:
                            shutil.copy(os.path.join(target, img_file), os.path.join(dir_path, img_file))

                    with ZipFile(matched_file[2].zipped.path, 'w') as archive_file:
                        for (root_path, _, file_names) in os.walk(dir_path):
                            for current_file in file_names:
                                archive_file.write(
                                    os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
                    shutil.rmtree(dir_path, ignore_errors=True)
                else:
                    # Single-file download: move/copy it straight into place.
                    self.logger.info(
                        "For archive: {archive}, downloading file: (unknown)".format(
                            archive=matched_file[2],
                            filename=matched_file[0],
                        ))
                    if mode == 'local_move':
                        shutil.move(target, matched_file[2].zipped.path)
                    else:
                        shutil.copy(target, matched_file[2].zipped.path)
                    if self.settings.convert_rar_to_zip and os.path.splitext(matched_file[0])[1].lower() == ".rar":
                        self.logger.info(
                            "For archive: {}, converting rar: {} to zip".format(
                                matched_file[2],
                                matched_file[2].zipped.path
                            )
                        )
                        convert_rar_to_zip(matched_file[2].zipped.path)

                self.process_downloaded_archive(matched_file[2])
Ejemplo n.º 13
0
    def download_all_missing(self, archives: 'Optional[Iterable[Archive]]' = None) -> None:
        """Fetch pending remote downloads (Hath folders and torrent
        files/folders) over the FTPS connection, pack them into local zips
        where needed, and post-process each completed archive.

        :param archives: explicit archives to process; when falsy, falls back
            to ``Archive.objects.filter_by_dl_remote()``.
        """

        files_torrent = []
        files_hath = []

        if not archives:
            found_archives: Iterable[Archive] = list(Archive.objects.filter_by_dl_remote())
        else:
            found_archives = archives

        if not found_archives:
            return

        # Split the workload by download backend, based on the match_type tag.
        for archive in found_archives:
            if 'torrent' in archive.match_type:
                files_torrent.append(archive)
            elif 'hath' in archive.match_type:
                files_hath.append(archive)

        if len(files_torrent) + len(files_hath) == 0:
            return

        self.start_connection()

        if not self.ftps:
            self.logger.error(
                "Cannot download the archives, the FTP connection is not initialized."
            )
            return None

        # Hath downloads
        if len(files_hath) > 0:
            self.set_current_dir(self.settings.providers['panda'].remote_hath_dir)
            # self.ftps.encoding = 'utf8'

            # Match remote hath folders (named "... [gid]") to archives by gid.
            files_matched_hath = []
            for line in self.ftps.mlsd(facts=["type"]):
                if line[1]["type"] != 'dir':
                    continue
                m = re.search(r'.*?\[(\d+)\]$', line[0])
                if m:
                    for archive in files_hath:
                        # NOTE(review): assumes hath-matched archives always
                        # have a gallery set - confirm upstream, otherwise
                        # this raises AttributeError on gallery=None.
                        if m.group(1) == archive.gallery.gid:
                            files_matched_hath.append(
                                (line[0], archive.zipped.path, int(archive.filesize), archive))

            # Tuple layout: (remote folder, local zip path, expected size, archive).
            for matched_file_hath in files_matched_hath:
                # Sum the remote image sizes (galleryinfo.txt excluded) to
                # check whether the hath client finished its download.
                total_remote_size = 0
                remote_ftp_tuples = []
                for img_file_tuple in self.ftps.mlsd(path=matched_file_hath[0], facts=["type", "size"]):
                    if img_file_tuple[1]["type"] != 'file' or img_file_tuple[0] == 'galleryinfo.txt':
                        continue
                    total_remote_size += int(img_file_tuple[1]["size"])
                    remote_ftp_tuples.append((img_file_tuple[0], img_file_tuple[1]["size"]))
                if total_remote_size != matched_file_hath[2]:
                    self.logger.info(
                        "For archive: {archive}, remote folder: {folder} "
                        "has not completed the download ({current}/{total}), skipping".format(
                            archive=matched_file_hath[3],
                            folder=matched_file_hath[0],
                            current=filesizeformat(total_remote_size),
                            total=filesizeformat(matched_file_hath[2])
                        )
                    )
                    continue
                # NOTE(review): the format string has no {filename} placeholder
                # but a filename kwarg is passed - message looks mangled;
                # confirm against upstream before changing.
                self.logger.info(
                    "For archive: {archive}, downloading and creating zip "
                    "for folder (unknown), {image_count} images".format(
                        archive=matched_file_hath[3],
                        filename=matched_file_hath[1],
                        image_count=len(remote_ftp_tuples)
                    ))
                dir_path = mkdtemp()
                self.current_download['total'] = len(remote_ftp_tuples)
                # Download each image, retrying up to 10 times on connection
                # drops by re-opening the FTPS session and remote directory.
                for count, remote_file in enumerate(sorted(remote_ftp_tuples), start=1):
                    for retry_count in range(10):
                        try:
                            with open(os.path.join(dir_path, remote_file[0]), "wb") as file:
                                self.current_download['index'] = count
                                self.write_file_update_progress(
                                    'RETR %s' % (str(matched_file_hath[0]) + "/" + remote_file[0]),
                                    file.write,
                                    int(remote_file[1])
                                )
                        except (ConnectionResetError, socket.timeout, TimeoutError):
                            self.logger.error("Hath download failed for file {} of {}, restarting connection...".format(
                                count,
                                len(remote_ftp_tuples))
                            )
                            self.ftps.close()
                            self.start_connection()
                            self.set_current_dir(self.settings.providers['panda'].remote_hath_dir)
                        else:
                            break
                # Pack the downloaded images into the archive zip, flattening
                # any directory structure (arcname keeps only the basename).
                with ZipFile(os.path.join(self.settings.MEDIA_ROOT,
                                          matched_file_hath[1]),
                             'w') as archive_file:
                    for (root_path, _, file_names) in os.walk(dir_path):
                        for current_file in file_names:
                            archive_file.write(
                                os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
                shutil.rmtree(dir_path, ignore_errors=True)

                self.process_downloaded_archive(matched_file_hath[3])

        # Torrent downloads
        if len(files_torrent) > 0:
            self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
            self.ftps.encoding = 'utf8'
            # Match remote entries to archives by comparing the sanitized
            # remote name against the local zip name minus the " [gid]" suffix.
            files_matched_torrent = []
            for line in self.ftps.mlsd(facts=["type", "size"]):
                if not line[0]:
                    continue
                if 'type' not in line[1]:
                    continue
                if line[1]["type"] != 'dir' and line[1]["type"] != 'file':
                    continue
                for archive in files_torrent:
                    if archive.gallery:
                        cleaned_torrent_name = os.path.splitext(
                            os.path.basename(archive.zipped.path))[0].replace(' [' + archive.gallery.gid + ']', '')
                    else:
                        cleaned_torrent_name = os.path.splitext(os.path.basename(archive.zipped.path))[0]
                    if replace_illegal_name(os.path.splitext(line[0])[0]) in cleaned_torrent_name:
                        # Tuple layout: (remote name, type, size, archive);
                        # directories get size 0, it is not used for them.
                        if line[1]["type"] == 'dir':
                            files_matched_torrent.append((line[0], line[1]["type"], 0, archive))
                        else:
                            files_matched_torrent.append((line[0], line[1]["type"], int(line[1]["size"]), archive))
            for matched_file_torrent in files_matched_torrent:
                if matched_file_torrent[1] == 'dir':
                    # Remote directory: download every file, then zip locally.
                    dir_path = mkdtemp()
                    remote_ftp_files = list(self.ftps.mlsd(path=matched_file_torrent[0], facts=["type", "size"]))
                    self.current_download['total'] = len(remote_ftp_files)
                    # NOTE(review): same mangled "(unknown)" message as above.
                    self.logger.info(
                        "For archive: {archive}, downloading and creating zip "
                        "for folder (unknown), {image_count} images".format(
                            archive=matched_file_torrent[3],
                            filename=matched_file_torrent[0],
                            image_count=len(remote_ftp_files)
                        ))
                    for count, img_file_tuple in enumerate(remote_ftp_files):
                        if img_file_tuple[1]["type"] != 'file':
                            continue
                        # Same 10-attempt retry-with-reconnect scheme as hath.
                        for retry_count in range(10):
                            try:
                                with open(os.path.join(dir_path, img_file_tuple[0]), "wb") as file:
                                    self.current_download['index'] = count
                                    self.write_file_update_progress(
                                        'RETR %s' % (str(matched_file_torrent[0]) + "/" + img_file_tuple[0]),
                                        file.write,
                                        int(img_file_tuple[1]["size"])
                                    )
                            except (ConnectionResetError, socket.timeout, TimeoutError):
                                self.logger.error("Torrent download failed for folder, restarting connection...")
                                self.ftps.close()
                                self.start_connection()
                                self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
                            else:
                                break
                    with ZipFile(matched_file_torrent[3].zipped.path, 'w') as archive_file:
                        for (root_path, _, file_names) in os.walk(dir_path):
                            for current_file in file_names:
                                archive_file.write(
                                    os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
                    shutil.rmtree(dir_path, ignore_errors=True)
                else:
                    # Remote single file: stream it straight into the local zip path.
                    self.logger.info(
                        "For archive: {archive} downloading remote file: {remote} to local file: {local}".format(
                            archive=matched_file_torrent[3],
                            remote=matched_file_torrent[0],
                            local=matched_file_torrent[3].zipped.path
                        )
                    )
                    self.current_download['total'] = 1
                    for retry_count in range(10):
                        try:
                            with open(matched_file_torrent[3].zipped.path, "wb") as file:
                                self.current_download['index'] = 1
                                self.write_file_update_progress(
                                    'RETR %s' % matched_file_torrent[0], file.write, matched_file_torrent[2])
                        except (ConnectionResetError, socket.timeout, TimeoutError):
                            self.logger.error("Torrent download failed for archive, restarting connection...")
                            self.ftps.close()
                            self.start_connection()
                            self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
                        else:
                            break
                    # Optionally convert rar torrents to zip after download.
                    if self.settings.convert_rar_to_zip and os.path.splitext(matched_file_torrent[0])[1].lower() == ".rar":
                        self.logger.info(
                            "For archive: {}, converting rar: {} to zip".format(
                                matched_file_torrent[3],
                                matched_file_torrent[3].zipped.path
                            )
                        )
                        convert_rar_to_zip(matched_file_torrent[3].zipped.path)

                self.process_downloaded_archive(matched_file_torrent[3])

        self.ftps.close()
Ejemplo n.º 14
0
    def start_download(self) -> None:
        """Download the gallery link with the external ``megadl`` tool into a
        temporary directory, then move the result into the archive folder.

        Sets ``self.return_code`` to 0 on failure and 1 on success. The
        temporary download directory is always removed, including on the
        error paths (fixes a temp-directory leak on early returns).
        """

        if not self.gallery or not self.gallery.link:
            return

        # Prefer the explicitly configured executable path; otherwise
        # resolve the executable name against PATH.
        if self.own_settings.megadl_executable_path:
            exe_path_to_use = shutil.which(
                self.own_settings.megadl_executable_path)
        else:
            exe_path_to_use = shutil.which(
                self.own_settings.megadl_executable_name)

        if not exe_path_to_use:
            self.return_code = 0
            self.logger.error("The megadl tools was not found")
            return

        directory_path = mkdtemp()

        arguments = [
            "--no-progress", "--print-names", "--path",
            "{}".format(directory_path)
        ]

        if self.own_settings.proxy:
            arguments.append("--proxy")
            arguments.append("{}".format(self.own_settings.proxy))

        if self.own_settings.extra_megadl_arguments:
            arguments.append("{}".format(
                self.own_settings.extra_megadl_arguments))

        arguments.append("{}".format(self.gallery.link))

        self.logger.info("Calling megadl: {}.".format(" ".join(
            [exe_path_to_use, *arguments])))

        # List-form invocation (no shell) with captured stdout/stderr.
        process_result = subprocess.run([exe_path_to_use, *arguments],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        universal_newlines=True)

        message_text = process_result.stdout

        if not message_text:
            self.return_code = 0
            self.logger.error(
                "The link could not be downloaded, no output was generated after running megadl"
            )
            # Fix: remove the temporary directory on this error path too.
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        if process_result.stderr:
            self.return_code = 0
            self.logger.error(
                "An error was captured when running megadl: {}".format(
                    process_result.stderr))
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        if "WARNING: Skipping invalid" in message_text:
            self.return_code = 0
            self.logger.error(
                "The link could not be downloaded: {}".format(message_text))
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        # If we downloaded a folder, just take the first result
        file_names = message_text.splitlines()
        file_name = file_names[0]

        output_path = os.path.join(directory_path, file_name)

        if not os.path.isfile(output_path):
            self.return_code = 0
            self.logger.error(
                "The resulting download file was not found: {}".format(
                    file_name))
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         replace_illegal_name(file_name)))

        self.gallery.title = os.path.splitext(file_name)[0]

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        shutil.move(output_path, filepath)
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            self.logger.error("Could not download archive")
            self.return_code = 0
Ejemplo n.º 15
0
    def start_download(self) -> None:
        """Download a gallery by walking its reader pages one URL at a time,
        saving each page's image, then packing everything into a zip under
        MEDIA_ROOT and updating filesize/filecount/crc32.

        Sets ``self.return_code`` to 0 on failure and 1 on success.
        """

        if not self.gallery or not self.gallery.link:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        # Reserve a non-clashing local zip filename under MEDIA_ROOT.
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.archive_dl_folder,
                to_use_filename + '.zip'))
        # Reuse the cached gallery HTML when present, otherwise fetch it.
        if self.gallery.content:
            soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
        else:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_page = requests.get(
                self.gallery.link,
                **request_dict
            )
            soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

        # NOTE(review): find() can return None when the reader button is
        # absent, which raises TypeError here before the fallback below
        # can run - confirm and guard upstream.
        gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

        # Some URLs are really bad formatted
        gallery_read = re.sub(
            r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*', r'\1',
            gallery_read,
            flags=re.DOTALL
        )

        if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
            logger.warning("Reading gallery page not available, trying to guess the name.")
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

        if not gallery_read.endswith('page/1'):
            gallery_read += 'page/1'

        # Captures the ".../page/" prefix and the trailing page number so the
        # loop below can increment to the next page URL.
        page_regex = re.compile(r"(.*?page/)(\d+)/*$", re.IGNORECASE)

        last_image = ''

        directory_path = mkdtemp()

        logger.info('Downloading gallery: {}'.format(self.gallery.title))

        # second_pass: one retry with an alternative guessed URL on a 404 of
        # the very first page before giving up.
        second_pass = False
        while True:

            try:
                request_dict = construct_request_dict(self.settings, self.own_settings)
                gallery_read_page = requests.get(
                    gallery_read,
                    **request_dict
                )
            except requests.exceptions.MissingSchema:
                logger.error("Malformed URL: {}, skipping".format(gallery_read))
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return

            if gallery_read_page.status_code == 404:
                if gallery_read.endswith('page/1'):
                    if not second_pass:
                        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
                        second_pass = True
                        continue
                    logger.error("Last page was the first one: {}, stopping".format(gallery_read))
                    self.return_code = 0
                    shutil.rmtree(directory_path, ignore_errors=True)
                    return
                # yield("Got to last gallery page, stopping")
                break

            soup_2 = BeautifulSoup(gallery_read_page.content, 'html.parser')
            img_find = soup_2.find("img", {"class": "open"})

            if not img_find:
                logger.error("Gallery not available, skipping")
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return

            img = img_find['src']

            # A repeated image means the reader looped past the last page.
            if last_image != '' and last_image == img:
                # yield('Current image is the same as previous, skipping')
                break
            last_image = img
            img_name = os.path.basename(img)
            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_file = requests.get(
                img,
                **request_dict
            )
            if request_file.status_code == 404:
                # yield("Got to last image, stopping")
                break
            with open(os.path.join(directory_path, img_name), "wb") as fo:
                for chunk in request_file.iter_content(4096):
                    fo.write(chunk)

            # Advance to the next reader page by incrementing the page number.
            page_match = page_regex.search(gallery_read)

            if page_match:
                gallery_read = page_match.group(1) + str(int(page_match.group(2)) + 1)
            else:
                # yield("Could not match to change page, stopping")
                break

        file_path = os.path.join(
            self.settings.MEDIA_ROOT,
            self.gallery.filename
        )

        # Pack the downloaded images, flattening paths to their basenames.
        with ZipFile(file_path, 'w') as archive:
            for (root_path, _, file_names) in os.walk(directory_path):
                for current_file in file_names:
                    archive.write(
                        os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(file_path)
            self.fileDownloaded = 1
            self.return_code = 1
Ejemplo n.º 16
0
    def start_download(self) -> None:
        """Scrape the gallery's reader page for its image URLs, download them
        and pack them into a zip under MEDIA_ROOT, updating
        filesize/filecount/crc32.

        Sets ``self.return_code`` to 0 on failure and 1 on success.
        """

        if not self.gallery or not self.gallery.link:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        # Reserve a non-clashing local zip filename under MEDIA_ROOT.
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.archive_dl_folder,
                to_use_filename + '.zip'))
        # Reuse the cached gallery HTML when present, otherwise fetch it.
        if self.gallery.content:
            soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
        else:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_page = requests.get(
                self.gallery.link,
                **request_dict
            )
            soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

        # Fix: find() returns None when the reader button is missing, which
        # previously raised TypeError on None['href']. Fall through to the
        # URL guesser below instead of crashing.
        read_anchor = soup_1.find("a", {"class": "x-btn-rounded"})
        gallery_read = read_anchor.get('href', '') if read_anchor else ''

        # Some URLs are really bad formatted
        gallery_read = re.sub(
            r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*', r'\1',
            gallery_read,
            flags=re.DOTALL
        )

        if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
            logger.warning("Reading gallery page not available, trying to guess the name.")
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

        if not gallery_read.endswith('page/1'):
            gallery_read += 'page/1'

        logger.info('Downloading gallery: {}'.format(self.gallery.title))

        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(
                gallery_read,
                **request_dict
            )
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            return

        # One retry with an alternative guessed URL on a non-200 response.
        if gallery_read_page.status_code != 200:
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
            try:
                request_dict = construct_request_dict(self.settings, self.own_settings)
                gallery_read_page = requests.get(
                    gallery_read,
                    **request_dict
                )
            except requests.exceptions.MissingSchema:
                logger.error("Malformed URL: {}, skipping".format(gallery_read))
                self.return_code = 0
                return

        if gallery_read_page.status_code == 200:

            image_urls = self.get_img_urls_from_gallery_read_page(gallery_read_page.text)

            if not image_urls:
                logger.error("Could not find image links, archive not downloaded")
                self.return_code = 0
                return

            directory_path = mkdtemp()

            for image_url in image_urls:
                img_name = os.path.basename(image_url)

                request_dict = construct_request_dict(self.settings, self.own_settings)
                request_file = requests.get(
                    image_url,
                    **request_dict
                )
                if request_file.status_code == 404:
                    logger.warning("Image link reported 404 error, stopping")
                    break
                with open(os.path.join(directory_path, img_name), "wb") as fo:
                    for chunk in request_file.iter_content(4096):
                        fo.write(chunk)

            file_path = os.path.join(
                self.settings.MEDIA_ROOT,
                self.gallery.filename
            )

            # Pack the downloaded images, flattening paths to their basenames.
            with ZipFile(file_path, 'w') as archive:
                for (root_path, _, file_names) in os.walk(directory_path):
                    for current_file in file_names:
                        archive.write(
                            os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
            shutil.rmtree(directory_path, ignore_errors=True)

            self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
            if self.gallery.filesize > 0:
                self.crc32 = calc_crc32(file_path)
                self.fileDownloaded = 1
                self.return_code = 1
        else:
            logger.error("Wrong HTML code returned, could not download, link: {}".format(gallery_read))
            self.return_code = 0
Ejemplo n.º 17
0
    def start_download(self) -> None:
        """Request an archiver download link for the gallery, stream the zip
        into MEDIA_ROOT and update filesize/filecount/crc32.

        Sets ``self.return_code`` to 0 on failure and 1 on success.
        """

        if not self.gallery:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        # Reserve a non-clashing local zip filename under MEDIA_ROOT.
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        if not (self.gallery.root and self.gallery.gid and self.gallery.token
                and self.gallery.archiver_key):
            logger.error(
                'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'
                .format(
                    self.gallery.root,
                    self.gallery.gid,
                    self.gallery.token,
                    self.gallery.archiver_key,
                ))
            self.return_code = 0
            return

        r = self.request_archive_download(self.gallery.root, self.gallery.gid,
                                          self.gallery.token,
                                          self.gallery.archiver_key)

        if not r:
            logger.error('Could not get download link.')
            self.return_code = 0
            return

        r.encoding = 'utf-8'

        if 'Invalid archiver key' in r.text:
            logger.error("Invalid archiver key received.")
            self.return_code = 0
        else:

            archive_link = get_archive_link_from_html_page(r.text)

            if archive_link == '':
                logger.error(
                    'Could not find archive link, page text: {}'.format(
                        r.text))
                self.return_code = 0
            else:
                # Strip any query string from the archive link.
                m = re.match(r"(.*?)(\?.*?)", archive_link)
                if m:
                    archive_link = m.group(1)

                logger.info('Got link: {}, from url: {}'.format(
                    archive_link, r.url))

                request_dict = construct_request_dict(self.settings,
                                                      self.own_settings)

                # Fix: stream must be the boolean True, not the string 'True'.
                request_file = requests.get(archive_link + '?start=1',
                                            stream=True,
                                            **request_dict)

                # Fix: check the download response itself; the original
                # checked 'r' (the archiver page, already validated above),
                # so failed downloads were written out as broken zips.
                if request_file and request_file.status_code == 200:
                    logger.info(
                        'Downloading gallery: {}.zip'.format(to_use_filename))
                    filepath = os.path.join(self.settings.MEDIA_ROOT,
                                            self.gallery.filename)
                    with open(filepath, 'wb') as fo:
                        for chunk in request_file.iter_content(4096):
                            fo.write(chunk)

                    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
                        filepath)
                    if self.gallery.filesize > 0:
                        self.crc32 = calc_crc32(filepath)

                        self.fileDownloaded = 1
                        self.return_code = 1

                else:
                    logger.error("Could not download archive")
                    self.return_code = 0
Ejemplo n.º 18
0
    def start_crawling(self, arg_line: List[str]) -> None:

        args = self.get_args(arg_line)

        if isinstance(args, ArgumentParserError):
            self.logger.info(str(args))
            return

        files = []
        do_not_replace = False
        values: DataDict = {}

        if args.remove_missing_files:
            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        Archive.objects.delete_by_filter(
                            pk=archive.pk)

            return
        elif args.display_missing_files:

            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
            return
        elif args.rematch_non_matches:
            self.settings.rematch_file_list = ['non-match']
            self.settings.rematch_file = True
            found_archives = Archive.objects.filter(
                match_type='non-match')
            if found_archives:
                self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)

        elif args.rematch_by_match_type:
            self.settings.rematch_file_list = [args.rematch_by_match_type]
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            found_archives = Archive.objects.filter(
                match_type=args.rematch_by_match_type)
            if found_archives:
                self.logger.info("Scanning {} archives matched by {}".format(
                    found_archives.count(), args.rematch_by_match_type
                ))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)
        elif args.rematch_wrong_filesize:
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            do_not_replace = True
            found_archives = Archive.objects.exclude(
                match_type='non-match', gallery_id__isnull=True)
            if found_archives:
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        continue
                    if archive.filesize == archive.gallery.filesize:
                        continue
                    files.append(archive.zipped.path)
                self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
        # Compute and persist CRC32 for archives that lack one, then return
        # early: this mode never reaches the matcher loop below.
        elif args.recalc_missing_crc32:

            found_archives = Archive.objects.filter(crc32='')

            if found_archives:
                self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
                for cnt, archive in enumerate(found_archives):
                    if os.path.isfile(archive.zipped.path):
                        crc32 = calc_crc32(
                            archive.zipped.path)
                        self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                        # Store only the freshly computed checksum.
                        values = {'crc32': crc32}
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                    else:
                        # Missing file on disk: report it but keep going.
                        self.logger.info("Archive {} of {}, path: {} does not exist".format(
                            (cnt + 1),
                            found_archives.count(),
                            archive.zipped.path
                        ))
            return
        # Rename every titled archive's zip so the filename matches its
        # (sanitized) title, then return early.
        elif args.all_filenames_to_title:

            archives_title_gid = Archive.objects.exclude(
                title='')

            if archives_title_gid:
                self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
                for cnt, archive in enumerate(archives_title_gid):
                    # Target path: same directory, title-derived filename.
                    # NOTE(review): dirname(archive.zipped.path) is already an
                    # absolute path here, so the MEDIA_ROOT joins below rely
                    # on os.path.join discarding its first argument when the
                    # second is absolute — confirm this is intentional.
                    current_path = os.path.join(os.path.dirname(
                        archive.zipped.path), replace_illegal_name(archive.title) + '.zip')

                    # Rename only when the name differs and the target file
                    # does not already exist (avoid clobbering another zip).
                    if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                        # NOTE(review): this branch is gated on
                        # args.all_filenames_to_title but tests
                        # args.filename_to_title here — possibly a typo for
                        # the branch's own flag; verify against the argument
                        # parser definition.
                        if args.filename_to_title == 'rename':
                            os.rename(archive.zipped.path, os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)
            return

        # Match non-matched archives against titles already present in the
        # local gallery/archive tables (no network), then return early.
        elif args.rematch_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                # (title, id) pairs prepared by get_archive_and_gallery_titles():
                # archives carry their gallery_id, galleries their own id.
                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    # Compare on the sanitized basename without the .zip suffix.
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")

                    # First preference: a sufficiently close gallery title
                    # (cutoff comes from the command-line argument).
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                        # NOTE(review): the same Gallery row is queried twice
                        # (title, then title_jpn); fetching it once into a
                        # local variable would halve the queries.
                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': 'gallery_database',
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                        # Presumably flips the matched gallery's dl_type when
                        # it was "failed" — confirm the manager helper's
                        # exact semantics.
                        Gallery.objects.update_by_dl_type(
                            {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                    else:
                        # Fallback: match against another archive's title and
                        # inherit that archive's gallery link and match type.
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
                            values = {
                                'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                                'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                                'zipped': archive.zipped.path,
                                'crc32': archive.crc32,
                                'match_type': archive.match_type,
                                'filesize': archive.filesize,
                                'filecount': archive.filecount,
                                'gallery_id': galleries_id_token[1]
                            }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)

            return

        # Dry-run variant of the internal-title matching above: only logs
        # the candidate matches, persists nothing, then returns early.
        elif args.display_match_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    # Same sanitized-basename comparison as the rematch mode.
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                    else:
                        # Fall back to archive titles when no gallery matched.
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))

            return
        # Default mode: build the file list from the folders/files named on
        # the command line, restricted to paths under MEDIA_ROOT.
        else:
            for folder in args.folder:
                p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
                # Reject arguments that escape MEDIA_ROOT after normalization
                # (e.g. via "..").
                # NOTE(review): a plain prefix test also accepts sibling
                # directories such as MEDIA_ROOT + "x"; comparing against
                # MEDIA_ROOT + os.sep would be stricter.
                if not p.startswith(self.settings.MEDIA_ROOT):
                    continue
                # Normalize back to a MEDIA_ROOT-relative, forward-slash path.
                folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")
                if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    # Recurse into directories, keeping results relative to
                    # MEDIA_ROOT and filtered by the configured glob pattern.
                    for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                        for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                            files.append(
                                os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
                elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    # A single file argument is accepted as-is (relative).
                    files.append(folder)

        # Rename the collected files so each filename matches its archive's
        # sanitized title, then return early (no matching in this mode).
        if args.rename_to_title:
            self.logger.info("Checking {} galleries".format(len(files)))
            for cnt, filepath in enumerate(files):

                archive = Archive.objects.filter(zipped=filepath).first()

                if archive:
                    # Target: same MEDIA_ROOT-relative directory, title-based
                    # filename (filepath is relative here, unlike the
                    # all_filenames_to_title mode above).
                    current_path = os.path.join(
                        os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')

                    # Rename only when needed and the target name is free.
                    if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                        # Anything other than the literal 'rename' value acts
                        # as a dry run that only logs the discrepancy.
                        if args.rename_to_title == 'rename':
                            os.rename(os.path.join(self.settings.MEDIA_ROOT, filepath), os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
            return

        # Optional overrides applied to every archive row written below.
        if args.set_reason:
            self.settings.archive_reason = args.set_reason

        if args.set_source:
            self.settings.archive_source = args.set_source

        # The creation of the files list ends here. From here onwards, it's processing them.

        if len(files) == 0:
            self.logger.info("No file matching needed, skipping matchers")
        else:
            self.logger.info("Starting checks for {} archives".format(len(files)))

            # Matchers arrive as (matcher, priority) pairs from the provider
            # context.
            matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
            for matcher in matchers_list:
                self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))

            for cnt, filepath in enumerate(files):

                self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))

                # Fallback title: basename without extension, underscores
                # replaced by spaces.
                title = re.sub(
                    '[_]', ' ', os.path.splitext(os.path.basename(filepath))[0])
                archive = Archive.objects.filter(zipped=filepath).first()
                # Reuse the stored CRC32 unless rehashing is forced or the
                # archive is unknown to the database.
                if not self.settings.rehash_files and archive:
                    crc32 = archive.crc32
                else:
                    crc32 = calc_crc32(
                        os.path.join(self.settings.MEDIA_ROOT, filepath))

                if archive:
                    # Already known: decide whether to rematch or skip.
                    if args.force_rematch:
                        self.logger.info("Doing a forced rematch")
                    elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                        if self.settings.rematch_file:
                            self.logger.info("File was already matched before, but rematch is ordered")
                        else:
                            self.logger.info("File was already matched before, not rematching")
                            continue
                    else:
                        self.logger.info("Match already saved, skipping")
                        continue
                else:
                    # Test for corrupt files
                    except_at_open = False
                    return_error = None
                    try:
                        my_zip = ZipFile(
                            os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                        # testzip() returns the first bad member's name, or
                        # None when the archive is intact.
                        return_error = my_zip.testzip()
                        my_zip.close()
                    except (BadZipFile, NotImplementedError):
                        # NOTE(review): if testzip() raises, my_zip is never
                        # closed; a `with ZipFile(...)` block would avoid the
                        # leaked file handle.
                        except_at_open = True
                    if except_at_open or return_error:
                        # Record the file as corrupt and move on.
                        self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                        values = {
                            'title': title,
                            'title_jpn': '',
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': 'corrupt',
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'source_type': 'folder'
                        }
                        # Apply configured reason/details/source overrides.
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.update_or_create_by_values_and_gid(
                            values, None, zipped=filepath)
                        continue

                    # Look for previous matches
                    archive = Archive.objects.filter(crc32=crc32).first()
                    if archive:
                        if self.settings.copy_match_file:
                            # Duplicate content: clone the prior match's
                            # metadata and gallery link for the new path.
                            self.logger.info("Found previous match by CRC32, copying its values")
                            values = {
                                'title': archive.title,
                                'title_jpn': archive.title_jpn,
                                'zipped': filepath,
                                'crc32': crc32,
                                'match_type': archive.match_type,
                                'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'gallery_id': archive.gallery_id,
                                'source_type': archive.source_type
                            }
                            if self.settings.archive_reason:
                                values.update({'reason': self.settings.archive_reason})
                            if self.settings.archive_details:
                                values.update({'details': self.settings.archive_details})
                            if self.settings.archive_source:
                                values.update({'source_type': self.settings.archive_source})
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
                            continue
                        else:
                            self.logger.info("Matching independently and ignoring previous match")

                match_result = False

                start_time = time.perf_counter()

                match_type = ''
                match_title = ''
                match_link = ''
                match_count = 0

                # Run matchers in list order, sleeping between attempts to
                # respect the configured wait timer; stop at the first hit.
                for i, matcher in enumerate(matchers_list):
                    if i > 0:
                        time.sleep(self.settings.wait_timer)
                    self.logger.info("Matching with: {}".format(matcher[0]))
                    if matcher[0].start_match(filepath, crc32):
                        match_type = matcher[0].found_by
                        match_title = matcher[0].match_title or ''
                        match_link = matcher[0].match_link or ''
                        match_count = matcher[0].match_count
                        match_result = True
                        break

                end_time = time.perf_counter()

                self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))

                # No matcher succeeded: persist as 'non-match' unless the
                # wrong-filesize mode set do_not_replace earlier.
                if not match_result and not do_not_replace:
                    self.logger.info('Could not match with any matcher, adding as non-match.')

                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'non-match',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    archive = Archive.objects.update_or_create_by_values_and_gid(
                        values, None, zipped=filepath)

                    # Optionally pre-compute fuzzy internal candidates so a
                    # human can pick a match later.
                    if self.settings.internal_matches_for_non_matches:
                        self.logger.info('Generating possible internal matches.')

                        archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                        self.logger.info('Generated matches for {}, found {}'.format(
                            archive.zipped.path,
                            archive.possible_matches.count()
                        ))
                elif match_result:
                    # Successful match: summarize what was found and by whom.
                    result_message = (
                        "Matched title: {}\n"
                        "Matched link: {}\n"
                        "Matched type: {}\n"
                        "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                    )
                    self.logger.info(result_message)

        self.logger.info('Folder crawler done.')