def start_download(self) -> None:
        """Download an archive from a generic HTTP server into MEDIA_ROOT.

        Derives the filename from the Content-Disposition header, falling
        back to the last path segment of the URL. On success sets
        fileDownloaded/return_code to 1; on failure sets return_code to 0.
        """

        if not self.gallery or not self.gallery.link:
            return

        logger.info(
            "Downloading an archive from a generic HTTP server: {}".format(
                self.gallery.link))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        # stream=True (bool, not the string 'True') so the body is fetched
        # in chunks instead of being loaded fully into memory.
        request_file = requests.get(self.gallery.link,
                                    stream=True,
                                    **request_dict)

        filename = get_filename_from_cd(
            request_file.headers.get('content-disposition'))

        if not filename:
            # Fall back to the last URL path segment. Note: str.find() was
            # wrong here — it returns -1 (truthy) when absent and 0 (falsy)
            # for a leading slash — so use a membership test instead.
            if '/' in self.gallery.link:
                filename = self.gallery.link.rsplit('/', 1)[1]

        if not filename:
            logger.error("Could not find a filename for link: {}".format(
                self.gallery.link))
            self.return_code = 0
            # Bail out: continuing would call .replace() on None below.
            return

        self.gallery.title = filename.replace(".zip", "")
        self.gallery.filename = replace_illegal_name(
            available_filename(
                self.settings.MEDIA_ROOT,
                os.path.join(self.own_settings.archive_dl_folder, filename)))

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.temp_archive:
            return

        logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, self.gallery.temp_archive['link']))

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))  # TODO: File could be cbz.

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['stream'] = True
        request_file = request_with_retries(
            self.gallery.temp_archive['link'],
            request_dict,
        )
        if not request_file:
            logger.error("Could not download archive")
            self.return_code = 0
            return
        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.link or not self.gallery.archiver_key:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_file = requests.get(self.gallery.archiver_key,
                                    stream='True',
                                    **request_dict)

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            os.remove(filepath)
            self.return_code = 0
Example #4
0
    def start_download(self) -> None:
        """Download the gallery archive from a Panda Backup-like source.

        Streams the archiver_key link into MEDIA_ROOT under a sanitized
        title-based filename; sets fileDownloaded/return_code on success,
        return_code = 0 on failure.
        """

        if not self.gallery:
            return

        self.logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, self.gallery.archiver_key['link']))

        self.gallery.title = replace_illegal_name(self.gallery.title)
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         self.gallery.title + '.zip'))

        # stream=True (bool, not the string 'True') so the body is fetched
        # in chunks instead of being loaded fully into memory.
        request_file = requests.get(self.gallery.archiver_key['link'],
                                    stream=True,
                                    headers=self.settings.requests_headers,
                                    timeout=self.settings.timeout_timer,
                                    cookies=self.own_settings.cookies)

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)
        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            self.logger.error("Could not download archive")
            self.return_code = 0
Example #5
0
    def start_download(self) -> None:
        """Download a gallery archive through an archiver (hath-like) page.

        Requests the archiver page, extracts the real archive link from the
        returned HTML, then streams the archive to MEDIA_ROOT. Every failure
        path logs and sets return_code = 0.
        """

        if not self.gallery:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))

        if not (self.gallery.root and self.gallery.gid and self.gallery.token
                and self.gallery.archiver_key):
            logger.error(
                'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'
                .format(
                    self.gallery.root,
                    self.gallery.gid,
                    self.gallery.token,
                    self.gallery.archiver_key,
                ))
            self.return_code = 0
            return

        r = self.request_archive_download(self.gallery.root, self.gallery.gid,
                                          self.gallery.token,
                                          self.gallery.archiver_key)

        if not r:
            logger.error('Could not get download link.')
            self.return_code = 0
            return

        r.encoding = 'utf-8'

        if 'Invalid archiver key' in r.text:
            logger.error("Invalid archiver key received.")
            self.return_code = 0
        else:

            archive_link = get_archive_link_from_html_page(r.text)

            if archive_link == '':
                logger.error(
                    'Could not find archive link, page text: {}'.format(
                        r.text))
                self.return_code = 0
            else:
                # Strip everything from the first '?' onward (some URLs are
                # really badly formatted).
                m = re.match(r"(.*?)(\?.*?)", archive_link)
                if m:
                    archive_link = m.group(1)

                logger.info('Got link: {}, from url: {}'.format(
                    archive_link, r.url))

                request_dict = construct_request_dict(self.settings,
                                                      self.own_settings)

                # stream=True (bool, not the string 'True') to download the
                # archive in chunks.
                request_file = requests.get(archive_link + '?start=1',
                                            stream=True,
                                            **request_dict)

                # Bug fix: validate the archive download response itself,
                # not the archiver page response 'r' checked earlier.
                if request_file and request_file.status_code == 200:
                    logger.info(
                        'Downloading gallery: {}.zip'.format(to_use_filename))
                    filepath = os.path.join(self.settings.MEDIA_ROOT,
                                            self.gallery.filename)
                    with open(filepath, 'wb') as fo:
                        for chunk in request_file.iter_content(4096):
                            fo.write(chunk)

                    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
                        filepath)
                    if self.gallery.filesize > 0:
                        self.crc32 = calc_crc32(filepath)

                        self.fileDownloaded = 1
                        self.return_code = 1
                    else:
                        # Downloaded file is empty/invalid: report failure
                        # instead of leaving return_code unset.
                        logger.error("Could not download archive")
                        self.return_code = 0

                else:
                    logger.error("Could not download archive")
                    self.return_code = 0
Example #6
0
    def start_download(self) -> None:
        """Download the gallery link with the external gallery-dl tool.

        Runs gallery-dl into a temp directory, moves the resulting file
        into MEDIA_ROOT under a sanitized available filename, and records
        zip info/CRC32. Any tool error sets return_code = 0.
        """

        if not self.gallery or not self.gallery.link:
            return

        # Prefer an explicitly configured executable path over PATH lookup.
        if self.settings.gallery_dl.executable_path:
            exe_path_to_use = shutil.which(
                self.settings.gallery_dl.executable_path)
        else:
            exe_path_to_use = shutil.which(
                self.settings.gallery_dl.executable_name)

        if not exe_path_to_use:
            self.return_code = 0
            logger.error("The gallery-dl executable was not found")
            return

        directory_path = mkdtemp()

        arguments = ["--zip", "--dest", "{}".format(directory_path)]

        if self.own_settings.proxy:
            arguments.append("--proxy")
            arguments.append("{}".format(self.own_settings.proxy))

        if self.settings.gallery_dl.config_file:
            arguments.append("--config")
            arguments.append("{}".format(self.settings.gallery_dl.config_file))

        if self.settings.gallery_dl.extra_arguments:
            arguments.append("{}".format(
                self.settings.gallery_dl.extra_arguments))

        arguments.append("{}".format(self.gallery.link))

        logger.info("Calling gallery-dl: {}.".format(" ".join(
            [exe_path_to_use, *arguments])))

        # List form (shell=False) avoids shell injection from the link.
        process_result = subprocess.run([exe_path_to_use, *arguments],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        universal_newlines=True)

        if process_result.stderr:
            self.return_code = 0
            logger.error(
                "An error was captured when running gallery-dl: {}".format(
                    process_result.stderr))
            return

        if process_result.returncode != 0:
            self.return_code = 0
            logger.error("Return code was not 0: {}".format(
                process_result.returncode))
            return

        # If we downloaded more than one file, get the latest one
        output_path = ''
        file_name = ''
        for (dir_path, dir_names, filenames) in os.walk(directory_path):
            for current_file in filenames:
                file_name = current_file
                output_path = os.path.join(dir_path, current_file)

        # Single check covers both "no file produced" and "path is not a
        # regular file" (the original had a redundant duplicate check here).
        if not output_path or not os.path.isfile(output_path):
            self.return_code = 0
            logger.error(
                "The resulting download file was not found: {}".format(
                    file_name))
            return

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         replace_illegal_name(file_name)))

        self.gallery.title = os.path.splitext(file_name)[0]

        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        shutil.move(output_path, filepath)
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
Example #7
0
 def process_downloaded_archive(self, archive: Archive) -> None:
     """Validate a downloaded archive's zip file and sync its metadata.

     Runs a zip integrity test, stores crc32/filesize/filecount on the
     Archive record, and — when the size disagrees with the matched
     gallery — queues a re-download restricted to panda_archive when the
     source supports it. Otherwise logs for manual review.
     """
     if os.path.isfile(archive.zipped.path):
         # testzip() returns the first bad member name (truthy) or None;
         # BadZipFile/NotImplementedError mean the file could not even be
         # opened (corrupt or unsupported compression).
         except_at_open = False
         return_error = None
         try:
             my_zip = ZipFile(
                 archive.zipped.path, 'r')
             return_error = my_zip.testzip()
             my_zip.close()
         except (BadZipFile, NotImplementedError):
             except_at_open = True
         if except_at_open or return_error:
             # NOTE(review): assumes archive.source_type is always a string
             # here — 'in' would raise TypeError on None; confirm upstream.
             if 'panda' in archive.source_type:
                 self.logger.error(
                     "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                     "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                 )
                 # Persist the current crc32 before re-queuing so the
                 # re-download can be compared against it.
                 crc32 = calc_crc32(
                     archive.zipped.path)
                 Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                 if self.web_queue and archive.gallery:
                     # Re-download with only the panda_archive downloader
                     # enabled, using a throwaway Settings copy.
                     temp_settings = Settings(load_from_config=self.settings.config)
                     temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                     self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                     return
             else:
                 self.logger.warning(
                     "For archive: {}, File check on downloaded zipfile: {}. "
                     "Check the file manually.".format(archive, archive.zipped.path)
                 )
         # Record the file's current crc32/size/count on the Archive row.
         crc32 = calc_crc32(
             archive.zipped.path)
         filesize = get_zip_filesize(
             archive.zipped.path)
         filecount = filecount_in_zip(
             archive.zipped.path)
         values = {'crc32': crc32,
                   'filesize': filesize,
                   'filecount': filecount,
                   }
         updated_archive = Archive.objects.add_or_update_from_values(
             values, pk=archive.pk)
         # Size mismatch against the matched gallery suggests a bad or
         # partial download; handle per source type.
         if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
             if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                 # Another archive already matches the gallery's size; keep
                 # this one without re-downloading.
                 self.logger.info(
                     "For archive: {} size does not match gallery, "
                     "but there's already another archive that matches.".format(updated_archive)
                 )
                 return
             if 'panda' in archive.source_type:
                 self.logger.info(
                     "For archive: {} size does not match gallery, "
                     "downloading again from panda_archive.".format(updated_archive)
                 )
                 if self.web_queue:
                     temp_settings = Settings(load_from_config=self.settings.config)
                     temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                     self.web_queue.enqueue_args_list(
                         (updated_archive.gallery.get_link(), ),
                         override_options=temp_settings
                     )
             else:
                 self.logger.warning(
                     "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                 )
Example #8
0
    def start_download(self) -> None:
        """Download the gallery link with the external megadl tool.

        Runs megadl into a temp directory, takes the first reported file
        name from stdout, and moves it into MEDIA_ROOT.
        """

        if not self.gallery or not self.gallery.link:
            return

        # Resolve the binary: explicit configured path wins over PATH lookup.
        if self.own_settings.megadl_executable_path:
            megadl_binary = shutil.which(
                self.own_settings.megadl_executable_path)
        else:
            megadl_binary = shutil.which(
                self.own_settings.megadl_executable_name)

        if not megadl_binary:
            self.return_code = 0
            self.logger.error("The megadl tools was not found")
            return

        temp_dir = mkdtemp()

        command_args = [
            "--no-progress", "--print-names", "--path",
            "{}".format(temp_dir)
        ]

        if self.own_settings.proxy:
            command_args += ["--proxy", "{}".format(self.own_settings.proxy)]

        if self.own_settings.extra_megadl_arguments:
            command_args.append("{}".format(
                self.own_settings.extra_megadl_arguments))

        command_args.append("{}".format(self.gallery.link))

        full_command = [megadl_binary, *command_args]

        self.logger.info("Calling megadl: {}.".format(" ".join(full_command)))

        run_result = subprocess.run(full_command,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)

        stdout_text = run_result.stdout

        if not stdout_text:
            self.return_code = 0
            self.logger.error(
                "The link could not be downloaded, no output was generated after running megadl"
            )
            return

        if run_result.stderr:
            self.return_code = 0
            self.logger.error(
                "An error was captured when running megadl: {}".format(
                    run_result.stderr))
            return

        if "WARNING: Skipping invalid" in stdout_text:
            self.return_code = 0
            self.logger.error(
                "The link could not be downloaded: {}".format(stdout_text))
            return

        # A folder download prints several names; keep only the first one.
        downloaded_name = stdout_text.splitlines()[0]

        source_path = os.path.join(temp_dir, downloaded_name)

        if not os.path.isfile(source_path):
            self.return_code = 0
            self.logger.error(
                "The resulting download file was not found: {}".format(
                    downloaded_name))
            return

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         replace_illegal_name(downloaded_name)))

        self.gallery.title = os.path.splitext(downloaded_name)[0]

        final_path = os.path.join(self.settings.MEDIA_ROOT,
                                  self.gallery.filename)

        shutil.move(source_path, final_path)
        shutil.rmtree(temp_dir, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            final_path)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(final_path)
            self.fileDownloaded = 1
            self.return_code = 1
        else:
            self.logger.error("Could not download archive")
            self.return_code = 0
Example #9
0
    def start_download(self) -> None:
        """Download a gallery by walking its reader pages one image at a time.

        Finds the reader URL from the gallery page (guessing it when the
        page is unavailable), fetches each page's image until the last page
        is reached, then zips the images into MEDIA_ROOT.
        """

        if not self.gallery or not self.gallery.link:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.archive_dl_folder,
                to_use_filename + '.zip'))
        if self.gallery.content:
            soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
        else:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_page = requests.get(
                self.gallery.link,
                **request_dict
            )
            soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

        gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

        # Some URLs are really bad formatted
        gallery_read = re.sub(
            r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*', r'\1',
            gallery_read,
            flags=re.DOTALL
        )

        if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
            logger.warning("Reading gallery page not available, trying to guess the name.")
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

        if not gallery_read.endswith('page/1'):
            gallery_read += 'page/1'

        # Matches ".../page/<N>" so the trailing number can be incremented.
        page_regex = re.compile(r"(.*?page/)(\d+)/*$", re.IGNORECASE)

        last_image = ''

        directory_path = mkdtemp()

        logger.info('Downloading gallery: {}'.format(self.gallery.title))

        # second_pass: one retry with an alternative guessed reader URL.
        second_pass = False
        while True:

            try:
                request_dict = construct_request_dict(self.settings, self.own_settings)
                gallery_read_page = requests.get(
                    gallery_read,
                    **request_dict
                )
            except requests.exceptions.MissingSchema:
                logger.error("Malformed URL: {}, skipping".format(gallery_read))
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return

            if gallery_read_page.status_code == 404:
                if gallery_read.endswith('page/1'):
                    if not second_pass:
                        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
                        second_pass = True
                        continue
                    logger.error("Last page was the first one: {}, stopping".format(gallery_read))
                    self.return_code = 0
                    shutil.rmtree(directory_path, ignore_errors=True)
                    return
                # 404 past page 1 means we got to the last gallery page.
                break

            soup_2 = BeautifulSoup(gallery_read_page.content, 'html.parser')
            img_find = soup_2.find("img", {"class": "open"})

            if not img_find:
                logger.error("Gallery not available, skipping")
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return

            img = img_find['src']

            # Repeated image URL signals the reader looped past the end.
            if last_image != '' and last_image == img:
                break
            last_image = img
            img_name = os.path.basename(img)
            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_file = requests.get(
                img,
                **request_dict
            )
            if request_file.status_code == 404:
                break
            with open(os.path.join(directory_path, img_name), "wb") as fo:
                for chunk in request_file.iter_content(4096):
                    fo.write(chunk)

            page_match = page_regex.search(gallery_read)

            if page_match:
                gallery_read = page_match.group(1) + str(int(page_match.group(2)) + 1)
            else:
                # Could not match to change page, stopping.
                break

        file_path = os.path.join(
            self.settings.MEDIA_ROOT,
            self.gallery.filename
        )

        with ZipFile(file_path, 'w') as archive:
            for (root_path, _, file_names) in os.walk(directory_path):
                for current_file in file_names:
                    archive.write(
                        os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(file_path)
            self.fileDownloaded = 1
            self.return_code = 1
        else:
            # Previously no failure branch existed here, so an empty zip
            # left return_code unset; report it like the sibling downloaders.
            logger.error("Could not download archive")
            self.return_code = 0
Example #10
0
    def start_download(self) -> None:
        """Download a gallery by scraping all image URLs off its reader page.

        Finds the reader URL from the gallery page (guessing it, with one
        retry, when needed), extracts every image URL from the reader page,
        downloads them into a temp directory, and zips them into MEDIA_ROOT.
        """

        if not self.gallery or not self.gallery.link:
            return

        to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.archive_dl_folder,
                to_use_filename + '.zip'))
        if self.gallery.content:
            soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
        else:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_page = requests.get(
                self.gallery.link,
                **request_dict
            )
            soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

        gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

        # Some URLs are really bad formatted
        gallery_read = re.sub(
            r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*', r'\1',
            gallery_read,
            flags=re.DOTALL
        )

        if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
            logger.warning("Reading gallery page not available, trying to guess the name.")
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

        if not gallery_read.endswith('page/1'):
            gallery_read += 'page/1'

        logger.info('Downloading gallery: {}'.format(self.gallery.title))

        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(
                gallery_read,
                **request_dict
            )
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            return

        # One retry with an alternative guessed reader URL.
        if gallery_read_page.status_code != 200:
            gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
            try:
                request_dict = construct_request_dict(self.settings, self.own_settings)
                gallery_read_page = requests.get(
                    gallery_read,
                    **request_dict
                )
            except requests.exceptions.MissingSchema:
                logger.error("Malformed URL: {}, skipping".format(gallery_read))
                self.return_code = 0
                return

        if gallery_read_page.status_code == 200:

            image_urls = self.get_img_urls_from_gallery_read_page(gallery_read_page.text)

            if not image_urls:
                logger.error("Could not find image links, archive not downloaded")
                self.return_code = 0
                return

            directory_path = mkdtemp()

            for image_url in image_urls:
                img_name = os.path.basename(image_url)

                request_dict = construct_request_dict(self.settings, self.own_settings)
                request_file = requests.get(
                    image_url,
                    **request_dict
                )
                if request_file.status_code == 404:
                    logger.warning("Image link reported 404 error, stopping")
                    break
                with open(os.path.join(directory_path, img_name), "wb") as fo:
                    for chunk in request_file.iter_content(4096):
                        fo.write(chunk)

            file_path = os.path.join(
                self.settings.MEDIA_ROOT,
                self.gallery.filename
            )

            with ZipFile(file_path, 'w') as archive:
                for (root_path, _, file_names) in os.walk(directory_path):
                    for current_file in file_names:
                        archive.write(
                            os.path.join(root_path, current_file), arcname=os.path.basename(current_file))
            shutil.rmtree(directory_path, ignore_errors=True)

            self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
            if self.gallery.filesize > 0:
                self.crc32 = calc_crc32(file_path)
                self.fileDownloaded = 1
                self.return_code = 1
            else:
                # Previously no failure branch existed here, so an empty zip
                # left return_code unset; report it like the other branches.
                logger.error("Could not download archive")
                self.return_code = 0
        else:
            logger.error("Wrong HTML code returned, could not download, link: {}".format(gallery_read))
            self.return_code = 0
Example #11
0
    def start_crawling(self, arg_line: List[str]) -> None:

        args = self.get_args(arg_line)

        if isinstance(args, ArgumentParserError):
            self.logger.info(str(args))
            return

        files = []
        do_not_replace = False
        values: DataDict = {}

        if args.remove_missing_files:
            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        Archive.objects.delete_by_filter(
                            pk=archive.pk)

            return
        elif args.display_missing_files:

            found_archives = Archive.objects.all()

            if found_archives:
                self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
            return
        elif args.rematch_non_matches:
            self.settings.rematch_file_list = ['non-match']
            self.settings.rematch_file = True
            found_archives = Archive.objects.filter(
                match_type='non-match')
            if found_archives:
                self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)

        elif args.rematch_by_match_type:
            self.settings.rematch_file_list = [args.rematch_by_match_type]
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            found_archives = Archive.objects.filter(
                match_type=args.rematch_by_match_type)
            if found_archives:
                self.logger.info("Scanning {} archives matched by {}".format(
                    found_archives.count(), args.rematch_by_match_type
                ))
                for archive in found_archives:
                    if os.path.isfile(archive.zipped.path):
                        files.append(archive.zipped.path)
        elif args.rematch_wrong_filesize:
            self.settings.rematch_file = True
            self.settings.replace_metadata = True
            do_not_replace = True
            found_archives = Archive.objects.exclude(
                match_type='non-match', gallery_id__isnull=True)
            if found_archives:
                for archive in found_archives:
                    if not os.path.isfile(archive.zipped.path):
                        continue
                    if archive.filesize == archive.gallery.filesize:
                        continue
                    files.append(archive.zipped.path)
                self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
        elif args.recalc_missing_crc32:

            found_archives = Archive.objects.filter(crc32='')

            if found_archives:
                self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
                for cnt, archive in enumerate(found_archives):
                    if os.path.isfile(archive.zipped.path):
                        crc32 = calc_crc32(
                            archive.zipped.path)
                        self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                        values = {'crc32': crc32}
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                    else:
                        self.logger.info("Archive {} of {}, path: {} does not exist".format(
                            (cnt + 1),
                            found_archives.count(),
                            archive.zipped.path
                        ))
            return
        elif args.all_filenames_to_title:

            archives_title_gid = Archive.objects.exclude(
                title='')

            if archives_title_gid:
                self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
                for cnt, archive in enumerate(archives_title_gid):
                    current_path = os.path.join(os.path.dirname(
                        archive.zipped.path), replace_illegal_name(archive.title) + '.zip')

                    if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                        if args.filename_to_title == 'rename':
                            os.rename(archive.zipped.path, os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)
            return

        elif args.rematch_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")

                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': 'gallery_database',
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
                        Gallery.objects.update_by_dl_type(
                            {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                    else:
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
                            values = {
                                'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                                'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                                'zipped': archive.zipped.path,
                                'crc32': archive.crc32,
                                'match_type': archive.match_type,
                                'filesize': archive.filesize,
                                'filecount': archive.filecount,
                                'gallery_id': galleries_id_token[1]
                            }
                            Archive.objects.add_or_update_from_values(
                                values, pk=archive.pk)

            return

        elif args.display_match_from_internal_gallery_titles:

            non_matched_archives = Archive.objects.filter(
                match_type='non-match')

            if non_matched_archives:

                archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

                self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
                for archive in non_matched_archives:
                    adjusted_title = replace_illegal_name(
                        os.path.basename(archive.zipped.path)).replace(".zip", "")
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                    else:
                        galleries_id_token = get_closer_gallery_title_from_list(
                            adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                        if galleries_id_token is not None:
                            self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))

            return
        else:
            for folder in args.folder:
                p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
                if not p.startswith(self.settings.MEDIA_ROOT):
                    continue
                folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")
                if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                        for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                            files.append(
                                os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
                elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                    files.append(folder)

        if args.rename_to_title:
            self.logger.info("Checking {} galleries".format(len(files)))
            for cnt, filepath in enumerate(files):

                archive = Archive.objects.filter(zipped=filepath).first()

                if archive:
                    current_path = os.path.join(
                        os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')

                    if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                        if args.rename_to_title == 'rename':
                            os.rename(os.path.join(self.settings.MEDIA_ROOT, filepath), os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path,
                                      }
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
            return

        if args.set_reason:
            self.settings.archive_reason = args.set_reason

        if args.set_source:
            self.settings.archive_source = args.set_source

        # The creation of the files list ends here. From here onwards, it's processing them.

        if len(files) == 0:
            self.logger.info("No file matching needed, skipping matchers")
        else:
            self.logger.info("Starting checks for {} archives".format(len(files)))

            matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
            for matcher in matchers_list:
                self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))

            for cnt, filepath in enumerate(files):

                self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))

                title = re.sub(
                    '[_]', ' ', os.path.splitext(os.path.basename(filepath))[0])
                archive = Archive.objects.filter(zipped=filepath).first()
                if not self.settings.rehash_files and archive:
                    crc32 = archive.crc32
                else:
                    crc32 = calc_crc32(
                        os.path.join(self.settings.MEDIA_ROOT, filepath))

                if archive:
                    if args.force_rematch:
                        self.logger.info("Doing a forced rematch")
                    elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                        if self.settings.rematch_file:
                            self.logger.info("File was already matched before, but rematch is ordered")
                        else:
                            self.logger.info("File was already matched before, not rematching")
                            continue
                    else:
                        self.logger.info("Match already saved, skipping")
                        continue
                else:
                    # Test for corrupt files
                    except_at_open = False
                    return_error = None
                    try:
                        my_zip = ZipFile(
                            os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                        return_error = my_zip.testzip()
                        my_zip.close()
                    except (BadZipFile, NotImplementedError):
                        except_at_open = True
                    if except_at_open or return_error:
                        self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                        values = {
                            'title': title,
                            'title_jpn': '',
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': 'corrupt',
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'source_type': 'folder'
                        }
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.update_or_create_by_values_and_gid(
                            values, None, zipped=filepath)
                        continue

                    # Look for previous matches
                    archive = Archive.objects.filter(crc32=crc32).first()
                    if archive:
                        if self.settings.copy_match_file:
                            self.logger.info("Found previous match by CRC32, copying its values")
                            values = {
                                'title': archive.title,
                                'title_jpn': archive.title_jpn,
                                'zipped': filepath,
                                'crc32': crc32,
                                'match_type': archive.match_type,
                                'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                                'gallery_id': archive.gallery_id,
                                'source_type': archive.source_type
                            }
                            if self.settings.archive_reason:
                                values.update({'reason': self.settings.archive_reason})
                            if self.settings.archive_details:
                                values.update({'details': self.settings.archive_details})
                            if self.settings.archive_source:
                                values.update({'source_type': self.settings.archive_source})
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
                            continue
                        else:
                            self.logger.info("Matching independently and ignoring previous match")

                match_result = False

                start_time = time.perf_counter()

                match_type = ''
                match_title = ''
                match_link = ''
                match_count = 0

                for i, matcher in enumerate(matchers_list):
                    if i > 0:
                        time.sleep(self.settings.wait_timer)
                    self.logger.info("Matching with: {}".format(matcher[0]))
                    if matcher[0].start_match(filepath, crc32):
                        match_type = matcher[0].found_by
                        match_title = matcher[0].match_title or ''
                        match_link = matcher[0].match_link or ''
                        match_count = matcher[0].match_count
                        match_result = True
                        break

                end_time = time.perf_counter()

                self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))

                if not match_result and not do_not_replace:
                    self.logger.info('Could not match with any matcher, adding as non-match.')

                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'non-match',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    archive = Archive.objects.update_or_create_by_values_and_gid(
                        values, None, zipped=filepath)

                    if self.settings.internal_matches_for_non_matches:
                        self.logger.info('Generating possible internal matches.')

                        archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                        self.logger.info('Generated matches for {}, found {}'.format(
                            archive.zipped.path,
                            archive.possible_matches.count()
                        ))
                elif match_result:
                    result_message = (
                        "Matched title: {}\n"
                        "Matched link: {}\n"
                        "Matched type: {}\n"
                        "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                    )
                    self.logger.info(result_message)

        self.logger.info('Folder crawler done.')
Example #12
0
    def compare_by_hash(self, zip_path: str) -> bool:
        """Match a local zip archive against a remote API by CRC32.

        Computes the CRC32 of *zip_path* and queries the configured API for
        galleries carrying that checksum. Every matching gallery is appended
        to ``self.values_array`` as a ``GalleryData`` and its link collected
        into ``self.gallery_links``.

        :param zip_path: filesystem path of the zip file to match.
        :return: True if at least one matching gallery link was found,
            False on missing file, request failure, bad response or no match.
        """
        if not os.path.isfile(zip_path):
            return False

        crc32 = calc_crc32(zip_path)

        api_url = urljoin(self.own_settings.url, constants.api_path)
        logger.info("Querying URL: {}".format(api_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'match': True, 'crc32': crc32}

        response = request_with_retries(
            api_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        try:
            response_data = response.json()
        except ValueError:
            # Server replied, but the body was not valid JSON (e.g. an HTML
            # error page). Treat it like any other failed request.
            logger.error("Could not parse server response as JSON")
            return False

        if 'error' in response_data:
            logger.info("Got error from server: {}".format(response_data['error']))
            return False

        matches_links = set()

        for gallery in response_data:
            if 'link' not in gallery:
                continue
            matches_links.add(gallery['link'])

            # Resolve container/magazine references to local gallery GIDs,
            # when a local gallery model is available.
            if self.settings.gallery_model:
                for source_key, target_key in (
                    ('gallery_container', 'gallery_container_gid'),
                    ('magazine', 'magazine_gid'),
                ):
                    if gallery.get(source_key):
                        referenced = self.settings.gallery_model.objects.filter(
                            gid=gallery[source_key], provider=gallery['provider']
                        ).first()
                        if referenced:
                            gallery[target_key] = referenced.gid

            if 'posted' in gallery:
                # A posted timestamp of 0 means "unknown": store None rather
                # than the Unix epoch.
                if gallery['posted'] != 0:
                    gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
                else:
                    gallery['posted'] = None

            self.values_array.append(GalleryData(**gallery))

        self.gallery_links = list(matches_links)
        if self.gallery_links:
            self.found_by = self.name
            return True
        return False