Example #1
0
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_dict['params'] = {'type': 'title', 'search_value': title}

        r = request_with_retries(
            urljoin(constants.main_url, 'Search'),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        for gallery in soup_1.find_all(
                "div", class_=re.compile("showcase_comics_product_image_box")):
            link_container = gallery.find("a")
            if link_container:
                matches_links.add(
                    urljoin(constants.main_url, link_container['href']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
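Most of these examples assume a construct_request_dict(settings, own_settings) helper that builds the keyword arguments handed to request_with_retries. Its definition is not included here; a minimal sketch, assuming it mirrors the keys built inline in Examples #17, #20 and #21 (headers, cookies, timeout), could look like this:

def construct_request_dict(settings: 'Settings', own_settings) -> dict:
    # Hypothetical reconstruction: only the keys seen inline elsewhere in
    # these examples; the real helper may add more.
    request_dict = {
        'headers': settings.requests_headers,
        'timeout': settings.timeout_timer,
    }
    if getattr(own_settings, 'cookies', None):
        request_dict['cookies'] = own_settings.cookies
    return request_dict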
Example #2
0
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        r = request_with_retries(
            urljoin(constants.main_url, 'search/') + quote(title),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        # content-row manga row
        for gallery in soup_1.find_all("div", class_=re.compile("content-row")):
            link_container = gallery.find("a", class_="content-title")
            if link_container:
                matches_links.add(urljoin(constants.main_url, link_container['href']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #3
0
    def crawl_feed(self, feed_url: str = '') -> list[str]:

        urls: list[str] = []

        if not feed_url:
            feed_url = constants.rss_url

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        if not response:
            logger.error("Got no response from feed URL: {}".format(feed_url))
            return urls

        response.encoding = 'utf-8'

        feed = feedparser.parse(
            response.text
        )

        for item in feed['items']:
            if any([item['title'].startswith(category) for category in self.own_settings.accepted_rss_categories]):
                urls.append(item['link'])
        return urls
Example #4
0
    def compare_by_title_search_page(self, title: str) -> bool:

        # https://www.fakku.net/search/+title
        full_url = urljoin(constants.main_url, 'search/+') + quote(title)

        logger.info("Querying URL: {}".format(full_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            full_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        response.encoding = 'utf-8'
        soup_1 = BeautifulSoup(response.text, 'html.parser')

        matches_links = set()

        for link_container in soup_1.find_all("div", class_=re.compile("content-meta")):
            title_container = link_container.find("a", class_="content-title")
            if title_container:
                matches_links.add(urljoin(constants.main_url, title_container.get('href')))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #5
0
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'q': title}

        response = request_with_retries(
            constants.main_page,
            request_dict,
            post=False,
        )

        if not response:
            return False
        response.encoding = 'utf-8'

        matches_links = set()

        # re.finditer always returns an iterator (never None), so iterate directly.
        for match in re.finditer(r'a href="/view/(\d+)/*"', response.text):
            matches_links.add("{}{}".format(
                constants.gallery_container_url, match.group(1)))

        self.gallery_links = list(matches_links)

        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #6
0
    def get_galleries_from_page_links(self, page_links: Iterable[str], page_links_results: List[DataDict]) -> None:

        api_page_links = []

        for page_link in page_links:

            m = re.search(r'(.+)/s/(\w+)/(\d+)-(\d+)', page_link)
            if not m:
                continue
            api_page_links.append(
                {'data': [m.group(3), m.group(2), m.group(4)]})

        api_page_links_chunks = list(chunks(api_page_links, 25))

        for i, group in enumerate(api_page_links_chunks):

            if i % 3 == 2:
                time.sleep(self.settings.wait_timer)

            data = {
                'method': 'gtoken',
                'pagelist': [x['data'] for x in group]}

            headers = {'Content-Type': 'application/json'}

            response = request_with_retries(
                constants.ge_api_url,
                {
                    'data': json.dumps(data),
                    'headers': {**headers, **self.settings.requests_headers},
                    'timeout': self.settings.timeout_timer
                },
                post=True,
                logger=self.logger
            )

            if not response:
                continue
            try:
                response_data = response.json()
            except (ValueError, KeyError):
                self.logger.error("Error parsing response to JSON: {}".format(response.text))
                continue

            for gid_token_pair in response_data['tokenlist']:

                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery_id=gid_token_pair['gid'],
                    link=link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)
                )

                if discard_approved:
                    if not self.settings.silent_processing:
                        self.logger.info(discard_message)
                    continue

                page_links_results.append(
                    {'data': (gid_token_pair['gid'], gid_token_pair['token']),
                     'link': link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)})
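Example #6 (and Example #11 below) split the id list into groups of 25 with a chunks helper before calling the API. The helper itself is not shown in these examples; a straightforward sketch of what it is assumed to do:

from typing import Iterator, List, TypeVar

T = TypeVar('T')


def chunks(seq: List[T], n: int) -> Iterator[List[T]]:
    # Yield successive n-sized slices of seq; the last slice may be shorter.
    for i in range(0, len(seq), n):
        yield seq[i:i + n]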
Example #7
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup:
            info_container = soup.find("div", id=re.compile("info"))
            title_jpn_match = info_container.h2 if info_container else None

            gallery_id_match = re.search(
                r'{}(\d+)'.format(constants.gallery_container_url), link)

            if not gallery_id_match:
                return None
            gallery_id = 'nh-' + gallery_id_match.group(1)

            gallery = GalleryData(gallery_id, self.name)
            gallery.title = soup.h1.get_text()
            gallery.title_jpn = title_jpn_match.get_text() if title_jpn_match else ''
            gallery_filecount_match = re.search(r'<div>(\d+) page(s*)</div>',
                                                response.text)
            if gallery_filecount_match:
                gallery.filecount = int(gallery_filecount_match.group(1))
            else:
                gallery.filecount = 0
            gallery.tags = []
            gallery.link = link
            gallery.posted = dateutil.parser.parse(
                soup.find("time")['datetime'])

            for tag_container in soup.find_all("a", {"class": "tag"}):
                tag_name = [text for text in tag_container.stripped_strings][0]
                tag_name = tag_name.split(" | ")[0]
                tag_scope = tag_container.parent.parent.get_text()
                tag_ext = tag_container.parent.get_text()
                tag_scope = tag_scope.replace(tag_ext, "").replace(
                    "\t", "").replace("\n", "").replace(":", "").lower()
                if tag_scope == 'tags':
                    gallery.tags.append(translate_tag(tag_name))
                elif tag_scope == 'categories':
                    gallery.category = tag_name.capitalize()
                else:
                    gallery.tags.append(
                        translate_tag(tag_scope + ":" + tag_name))

        else:
            return None
        return gallery
Example #8
0
    def compare_by_title(self, gallery_title: str) -> bool:

        api_url = urljoin(self.own_settings.metadata_url, constants.api_path)
        logger.info("Querying URL: {}".format(api_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'match': True, 'title': gallery_title}

        response = request_with_retries(
            api_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        try:
            response_data = response.json()
        except ValueError:
            logger.error("Error parsing response to JSON: {}".format(response.text))
            return False

        matches_links = set()

        if 'error' in response_data:
            logger.info("Got error from server: {}".format(response_data['error']))
            return False

        for gallery in response_data:
            if 'link' in gallery:
                matches_links.add(gallery['link'])
                if 'gallery_container' in gallery and gallery['gallery_container']:
                    if self.settings.gallery_model:
                        gallery_container = self.settings.gallery_model.objects.filter(
                            gid=gallery['gallery_container'], provider=gallery['provider']
                        )
                        first_gallery_container = gallery_container.first()
                        if first_gallery_container:
                            gallery['gallery_container_gid'] = first_gallery_container.gid
                if 'magazine' in gallery and gallery['magazine']:
                    if self.settings.gallery_model:
                        magazine = self.settings.gallery_model.objects.filter(
                            gid=gallery['magazine'], provider=gallery['provider']
                        )
                        first_magazine = magazine.first()
                        if first_magazine:
                            gallery['magazine_gid'] = first_magazine.gid
                if 'posted' in gallery:
                    if gallery['posted'] != 0:
                        gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
                    else:
                        gallery['posted'] = None
                self.values_array.append(GalleryData(**gallery))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #9
0
    def search_using_xml_api(self, title: str) -> bool:

        if not self.own_settings.api_key:
            logger.error("Can't use {} API without an api key. Check {}/API_MANUAL.txt".format(
                self.name,
                constants.main_page
            ))
            return False

        page = 1
        galleries = []

        while True:
            link = '{}/api/{}/?S=objectSearch&sn={}&page={}'.format(
                constants.main_page,
                self.own_settings.api_key,
                title,
                page
            )

            request_dict = construct_request_dict(self.settings, self.own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            if not response:
                break

            response.encoding = 'utf-8'
            # Based on: https://www.doujinshi.org/API_MANUAL.txt

            api_galleries = convert_api_response_text_to_gallery_dicts(response.text)

            if not api_galleries:
                break

            galleries.extend(api_galleries)

            # API returns 25 max results per query, so if we get 24 or less, means there's no more pages.
            # API Manual says 25, but we get 50 results normally!
            if len(api_galleries) < 50:
                break

            page += 1

        self.values_array = galleries

        self.gallery_links = [x.link for x in galleries if x.link]
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #10
0
def get_image_link_from_tweet_text(
        tweet_text: str, settings: 'Settings') -> typing.Optional[str]:
    tweet_links = re.findall(r"https://t\.co/\w+", tweet_text)
    for tweet_link in tweet_links:
        request_dict = {
            'timeout': settings.timeout_timer,
            'allow_redirects': False
        }
        r = request_with_retries(
            tweet_link,
            request_dict,
            post=False,
        )
        if not r:
            return None
        if 'Location' in r.headers:
            if r.headers['Location'].startswith(
                    'https://www.wani.com/product/'):
                request_dict_image = {
                    'headers': settings.requests_headers,
                    'timeout': settings.timeout_timer,
                }
                product_page = request_with_retries(
                    r.headers['Location'],
                    request_dict_image,
                    post=False,
                )
                if not product_page:
                    return None
                product_page.encoding = 'utf-8'
                soup = BeautifulSoup(product_page.text, 'html.parser')
                product_head = soup.find("head")
                if product_head:
                    img_container = product_head.find("meta",
                                                      property="og:image")
                    if img_container:
                        return img_container['content']

    return None
Example #11
0
    def get_galleries_from_xml(self,
                               url_group: Iterable[str]) -> list[GalleryData]:

        possible_gallery_ids = [
            self.id_from_url(gallery_url) for gallery_url in url_group
        ]

        galleries_ids = [
            gallery_id.replace('mugi-B', 'B')
            for gallery_id in possible_gallery_ids if gallery_id
        ]

        galleries = list()

        gallery_chunks = list(chunks(galleries_ids, 25))

        for i, group in enumerate(gallery_chunks):
            logger.info(
                "Calling API ({}). Gallery group: {}, galleries in group: {}, total groups: {}"
                .format(self.name, i + 1, len(group), len(gallery_chunks)))

            # API doesn't say anything about needing to wait between requests, but we wait just in case.
            if i > 0:
                time.sleep(self.own_settings.wait_timer)

            # Use only the ids in the current chunk, not the full list.
            link = constants.main_page + '/api/' + self.own_settings.api_key + '/?S=getID&ID=' + ",".join(
                group)

            request_dict = construct_request_dict(self.settings,
                                                  self.own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            if not response:
                continue

            response.encoding = 'utf-8'
            api_galleries = convert_api_response_text_to_gallery_dicts(
                response.text)

            if not api_galleries:
                continue
            galleries.extend(api_galleries)

        return galleries
Example #12
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        return self.process_regular_gallery_page(link, response.text)
Example #13
0
    def compare_by_title_json(self, title: str) -> bool:

        # https://www.fakku.net/suggest/return%20of%20me
        headers = {
            'Content-Type': 'application/json',
            'Referer': constants.main_url + '/',
            'X-Requested-With': 'XMLHttpRequest',
        }

        self.logger.info("Querying URL: {}".format(
            urljoin(constants.main_url, 'suggest/') + quote(title.lower())))

        response = request_with_retries(
            urljoin(constants.main_url, 'suggest/') + quote(title.lower()), {
                'headers': {
                    **headers,
                    **self.settings.requests_headers
                },
                'timeout': self.settings.timeout_timer
            },
            post=False,
            retries=3,
            logger=self.logger)

        if not response:
            self.logger.info("Got no response from server")
            return False

        response_data = response.json()

        matches_links = set()

        if 'error' in response_data:
            self.logger.info("Got error from server: {}".format(
                response_data['error']))
            return False

        for gallery in response_data:
            if gallery['type'] in ('doujinshi', 'manga', 'hentai'):
                matches_links.add(urljoin(constants.main_url, gallery['link']))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #14
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        fjord, gid, token = fjord_gid_token_from_link(link)

        if fjord is None or gid is None or token is None:
            return None

        if fjord:
            api_page = constants.ex_api_url
        else:
            api_page = constants.ge_api_url

        data = utilities.request_data_from_gid_token_iterable([(gid, token)])

        headers = {'Content-Type': 'application/json'}

        response = request_with_retries(api_page, {
            'data': json.dumps(data),
            'headers': {
                **headers,
                **self.settings.requests_headers
            },
            'cookies': self.own_settings.cookies,
            'timeout': self.settings.timeout_timer
        },
                                        post=True,
                                        logger=self.logger)

        if not response:
            return None
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            self.logger.error("Error parsing response to JSON: {}".format(
                response.text))
            return None
        for gallery_data in response_data['gmetadata']:
            if 'error' in gallery_data:
                self.logger.error("Adding gallery {}: "
                                  "failed with error: {}".format(
                                      gallery_data['gid'],
                                      gallery_data['error']))
                return None
            internal_gallery_data = map_external_gallery_data_to_internal(
                gallery_data)
            return internal_gallery_data
        return None
Example #15
0
    def start_download(self) -> None:

        if not self.gallery or not self.gallery.temp_archive:
            return

        logger.info(
            "Downloading an archive: {} from a Panda Backup-like source: {}".
            format(self.gallery.title, self.gallery.temp_archive['link']))

        to_use_filename = get_base_filename_string_from_gallery_data(
            self.gallery)

        to_use_filename = replace_illegal_name(to_use_filename)

        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder,
                         to_use_filename + '.zip'))  # TODO: File could be cbz.

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['stream'] = True
        request_file = request_with_retries(
            self.gallery.temp_archive['link'],
            request_dict,
        )
        if not request_file:
            logger.error("Could not download archive")
            self.return_code = 0
            return
        filepath = os.path.join(self.settings.MEDIA_ROOT,
                                self.gallery.filename)

        with open(filepath, 'wb') as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
            filepath)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(filepath)

            self.fileDownloaded = 1
            self.return_code = 1

        else:
            logger.error("Could not download archive")
            self.return_code = 0
Example #16
0
    def request_torrent_download(
            self, root: str, gid: str,
            token: str) -> Optional[requests.models.Response]:

        url = root + '/gallerytorrents.php'

        params = {'gid': gid, 't': token}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = params

        response = request_with_retries(
            url,
            request_dict,
            post=True,
        )

        return response
Example #17
0
    def request_torrent_download(
            self, root: str, gid: str,
            token: str) -> Optional[requests.models.Response]:

        url = root + '/gallerytorrents.php'

        params = {'gid': gid, 't': token}

        response = request_with_retries(url, {
            'params': params,
            'cookies': self.own_settings.cookies,
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer
        },
                                        post=True,
                                        logger=self.logger)

        return response
Example #18
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        link_root, gid, token = root_gid_token_from_link(link)

        if link_root is None or gid is None or token is None:
            return None

        if self.own_settings.use_ex_for_fjord and self.own_settings.cookies and link_root == constants.ex_api_url:
            api_page = constants.ex_api_url
        else:
            api_page = constants.ge_api_url

        data = utilities.request_data_from_gid_token_iterable([(gid, token)])

        headers = {'Content-Type': 'application/json'}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['headers'] = {**headers, **self.settings.requests_headers}
        request_dict['data'] = json.dumps(data)

        response = request_with_retries(
            api_page,
            request_dict,
            post=True,
        )

        if not response:
            return None
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(response.text))
            return None
        for gallery_data in response_data['gmetadata']:
            if 'error' in gallery_data:
                logger.error(
                    "Adding gallery {}: "
                    "failed with error: {}".format(gallery_data['gid'], gallery_data['error'])
                )
                return None
            internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
            return internal_gallery_data
        return None
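Examples #14 and #18 both build the POST body with utilities.request_data_from_gid_token_iterable and then read a 'gmetadata' list from the JSON response. The helper is not shown here; assuming a gdata-style metadata call (consistent with the 'gtoken' payload in Example #6 and the 'gmetadata' key read from the response), it presumably looks roughly like:

from typing import Iterable, Tuple


def request_data_from_gid_token_iterable(gid_token_pairs: Iterable[Tuple[str, str]]) -> dict:
    # Hypothetical reconstruction: build the JSON payload for a 'gdata'
    # metadata request from (gid, token) pairs. The 'namespace' flag is an
    # assumption about the API and may not be required.
    return {
        'method': 'gdata',
        'gidlist': [[gid, token] for gid, token in gid_token_pairs],
        'namespace': 1,
    }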
Example #19
0
    def request_archive_download(
            self, root: str, gid: str, token: str,
            key: str) -> Optional[requests.models.Response]:

        url = root + '/archiver.php'

        params = {'gid': gid, 'token': token, 'or': key}

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = params
        request_dict['data'] = constants.archive_download_data

        response = request_with_retries(
            url,
            request_dict,
            post=True,
        )

        return response
Example #20
0
    def request_archive_download(
            self, root: str, gid: str, token: str,
            key: str) -> Optional[requests.models.Response]:

        url = root + '/archiver.php'

        params = {'gid': gid, 'token': token, 'or': key}

        response = request_with_retries(url, {
            'params': params,
            'cookies': self.own_settings.cookies,
            'data': constants.archive_download_data,
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer
        },
                                        post=True,
                                        logger=self.logger)

        return response
Example #21
0
    def crawl_feed(self, feed_url: str = '') -> List[str]:

        urls: List[str] = []

        if not feed_url:
            feed_url = constants.rss_url

        response = request_with_retries(feed_url, {
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer,
        },
                                        post=False,
                                        logger=self.logger)

        if not response:
            self.logger.error(
                "No response from URL: {}, returning".format(feed_url))
            return urls

        response.encoding = 'utf-8'

        match_string = re.compile(r"/view/(\d+)/*$")

        soup = BeautifulSoup(response.text, 'html.parser')

        content_container = soup.find("div", class_="columns")

        if not content_container:
            self.logger.error("Content container not found, returning")
            return urls

        url_containers = content_container.find_all("a", href=match_string)

        for url_container in url_containers:

            url_link = url_container.get('href')

            complete_url = "{}{}".format(constants.main_page, url_link)

            urls.append(complete_url)

        return urls
Example #22
0
    def compare_by_title(self, title: str) -> bool:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        request_dict['params'] = {'q': title}

        r = request_with_retries(
            urljoin(constants.main_url, 'search/'),
            request_dict,
        )

        if not r:
            logger.info("Got no response from server")
            return False

        r.encoding = 'utf-8'
        soup_1 = BeautifulSoup(r.text, 'html.parser')

        matches_links = set()

        discard_titles = ["(SPANISH)", "(FRENCH)"]

        for link_container in soup_1.find_all(
                "a", class_=re.compile("product-card")):
            # Discard spanish/french releases
            title_container = link_container.find("div",
                                                  class_="product-card__name")
            if any(x for x in discard_titles
                   if title_container.get_text().startswith(x)):
                continue
            matches_links.add(
                urljoin(
                    constants.main_url,
                    urljoin(link_container['href'],
                            urlparse(link_container['href']).path)))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
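The nested urljoin/urlparse call in Example #22 drops the query string (and fragment) from each product link before resolving it against main_url. A small illustration with a hypothetical href:

from urllib.parse import urljoin, urlparse

href = '/products/example-title?variant=123'   # hypothetical href
clean = urljoin(href, urlparse(href).path)     # '/products/example-title'
full = urljoin('https://example.com/', clean)  # hypothetical main_url
print(full)  # https://example.com/products/example-title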
Example #23
0
    def crawl_feed(self, feed_url: str = '') -> list[ChaikaGalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        dict_list = []

        if not response:
            return []

        try:
            json_decoded = response.json()
        except (ValueError, KeyError):
            logger.error("Could not parse response to JSON: {}".format(
                response.text))
            return []

        if isinstance(json_decoded, dict):
            if 'galleries' in json_decoded:
                dict_list = json_decoded['galleries']
            else:
                dict_list.append(json_decoded)
        elif isinstance(json_decoded, list):
            dict_list = json_decoded

        total_galleries_filtered: list[ChaikaGalleryData] = []

        for gallery in dict_list:
            if 'result' in gallery:
                continue
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']),
                                                       timezone.utc)
            gallery_data = ChaikaGalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

        return total_galleries_filtered
Example #24
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        new_text = re.sub(r'(<div class="right">\d+?)</b>', r'\1',
                          response.text)

        if constants.main_url + '/magazines/' in link:
            return self.process_magazine_page(link, new_text)
        else:
            return self.process_regular_gallery_page(link, new_text)
Example #25
0
    def parse_posted_date_from_feed(self, link: str,
                                    gid: str) -> Optional[datetime]:
        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        feed = feedparser.parse(response.text)

        for item in feed['items']:
            if gid in item['id']:
                return date_parser.parse(
                    item['published'],
                    tzinfos=constants.extra_feed_url_timezone)
        return None
Example #26
0
    def compare_by_title(self, title: str) -> bool:

        headers = {'Content-Type': 'application/json'}

        api_link = constants.posts_api_url
        payload = {'search': title}

        response = request_with_retries(api_link, {
            'headers': {
                **headers,
                **self.settings.requests_headers
            },
            'timeout': self.settings.timeout_timer,
            'params': payload
        },
                                        post=False,
                                        logger=self.logger)

        if not response:
            return False
        response.encoding = 'utf-8'
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            return False

        matches_links = set()

        for gallery in response_data:
            matches_links.add(gallery['link'])

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example #27
0
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        match_string = re.compile(constants.main_page + '/(.+)/$')

        tags = []

        soup = BeautifulSoup(response.text, 'html.parser')

        content_container = soup.find("div", class_="content")

        if not content_container:
            return None

        artists_container = content_container.find_all(
            "a", href=re.compile(constants.main_page + '/artist/.*/$'))

        for artist in artists_container:
            tags.append("artist:{}".format(artist.get_text()))

        tags_container = content_container.find_all(
            "a", href=re.compile(constants.main_page + '/tag/.*/$'))

        for tag in tags_container:
            tags.append(tag.get_text())

        # thumbnail_small_container = content_container.find("img")
        # if thumbnail_small_container:
        #     thumbnail_url = thumbnail_small_container.get('src')
        thumbnail_container = soup.find("meta", property="og:image")
        thumbnail_url = thumbnail_container.get('content', '') if thumbnail_container else ''

        url_container = soup.find("meta", property="og:link")
        if not url_container:
            return None

        match_result = match_string.match(url_container.get('content', ''))
        if not match_result:
            return None

        title_container = soup.find("meta", property="og:title")

        gallery = GalleryData(
            match_result.group(1),
            self.name,
            link=link,
            title=title_container.get('content', '') if title_container else '',
            comment='',
            thumbnail_url=thumbnail_url,
            category='Manga',
            uploader='',
            posted=None,
            filecount=0,
            filesize=0,
            expunged=False,
            rating='',
            tags=translate_tag_list(tags),
            content=content_container.encode_contents(),
        )

        return gallery
Example #28
0
def wanted_generator(settings: 'Settings', attrs: 'AttributeManager'):
    own_settings = settings.providers[constants.provider_name]

    if not own_settings.api_key:
        logger.error(
            "Can't use {} API without an api key. Check {}/API_MANUAL.txt".
            format(constants.provider_name, constants.main_page))
        return False

    queries: DataDict = {}
    queries_slist_params: DataDict = {}

    for attr in attrs.filter(name__startswith='wanted_params_'):

        attr_info = attr.name.replace('wanted_params_', '')
        query_name, attr_name = attr_info.split("_", maxsplit=1)

        if query_name not in queries:
            queries[query_name] = {
                'page': 1,
                'S': 'objectSearch',
                'match': 0,
                'order': 'added',
                'flow': 'DESC'
            }

        if attr_name.startswith('slist_'):
            if query_name not in queries_slist_params:
                queries_slist_params[query_name] = []
            queries_slist_params[query_name].append('{}:{}'.format(
                attr_name.replace('slist_', ''), attr.value))
        else:
            queries[query_name].update({attr_name: attr.value})

    for query_name, slist_params in queries_slist_params.items():
        queries[query_name].update({'slist': '|'.join(slist_params)})

    for query_name, query_values in queries.items():

        while True:
            # Read the values from the newly created Provider Model,
            # that should be created like this (extracted from the form):
            # wanted_params_match: Any, Sounds Like, Start With, End With, Exact -> 0, 4, 1, 2, 3
            # wanted_params_age: 18+ -> blank/Y/N
            # wanted_params_anth: Anthology -> blank/Y/N
            # wanted_params_bcopy: Copybook -> blank/Y/N
            # wanted_params_FREE: Free -> blank/Y/N
            # wanted_params_flist: Type ->
            # blank: Any
            # 19: Bootleg
            # 18: Calendar
            # 12: Commercial Artbook
            # 8: Commercial CG
            # 7: Commercial Magazine
            # 25: Commercial Mook
            # 11: Commercial Novel
            # 10: Commercial other
            # 13: Commercial other book
            # 9: Commercial Soft
            # 2: Doujin CG
            # 24: Doujin Goods
            # 23: Doujin Movie
            # 22: Doujin Music
            # 21: Doujin Novel
            # 4: Doujin Other
            # 3: Doujin Soft
            # 1: Doujinshi
            # 5: Manga
            # 6: Manga (Part)
            # 17: Postcard
            # 16: Poster
            # 15: Shitajiki
            # 14: Telephone Card
            # 20: Unknown
            # wanted_params_date: Release date from -> yyyy-mm-dd
            # wanted_params_date2: Release date to -> yyyy-mm-dd
            # for slist parameters:
            # Here is the list of ALL search terms:
            # C: Circle
            # A: Author
            # P: Parody
            # H: Character
            # N: Convention
            # O: Collections
            # K: Content
            # G: Genre
            # T: Type
            # L: Publisher
            # I: Imprint
            # wanted_params_slist_C: Separated by |
            # wanted_params_slist_A: Separated by |
            # wanted_params_slist_P: Separated by |
            # wanted_params_slist_H: Separated by |
            # wanted_params_slist_K: Separated by |
            # wanted_params_slist_G: Separated by |
            # wanted_params_slist_N: One
            # wanted_params_slist_O: One
            # wanted_params_slist_L: One
            # wanted_params_slist_I: One
            # wanted_params_cont: One
            # wanted_params_sub: One
            # wanted_params_scen: Censored -> blank/Y/N

            new_query = urllib.parse.urlencode(query_values, doseq=True)

            logger.info(
                'Querying {} for auto wanted galleries, page: {}, query name: {}, query: {}'
                .format(constants.provider_name, query_values['page'],
                        query_name, new_query))

            link = '{}/api/{}/?{}'.format(constants.main_page,
                                          own_settings.api_key, new_query)

            provider, provider_created = Provider.objects.get_or_create(
                slug=constants.provider_name,
                defaults={'name': constants.provider_name})

            remaining_queries, int_created = attrs.get_or_create(
                provider=provider,
                name='remaining_queries',
                data_type='int',
                defaults={
                    'value_int': constants.daily_requests,
                })

            last_query_date, date_created = attrs.get_or_create(
                provider=provider,
                name='last_query_date',
                data_type='date',
                defaults={
                    'value_date': django_tz.now(),
                })

            if not date_created:
                limit_time = datetime.time(tzinfo=datetime.timezone(
                    datetime.timedelta(
                        hours=1)))  # GMT+1 is when server resets
                if last_query_date.value.timetz() < limit_time < django_tz.now(
                ).timetz():
                    remaining_queries.value = constants.daily_requests
                    remaining_queries.save()

            if remaining_queries.value <= 0:
                logger.warning(
                    "Daily queries quota {} reached for {}. It resets at 00:00 GMT+1"
                    .format(constants.daily_requests, constants.provider_name))
                return

            request_dict = construct_request_dict(settings, own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            remaining_queries.value -= 1
            remaining_queries.save()
            last_query_date.value = django_tz.now()
            last_query_date.save()

            if not response:
                logger.error(
                    'For provider {}: Got to page {}, but did not get a response, stopping'
                    .format(constants.provider_name, query_values['page']))
                break

            response.encoding = 'utf-8'
            # Based on: https://www.doujinshi.org/API_MANUAL.txt

            api_galleries = convert_api_response_text_to_gallery_dicts(
                response.text)

            if not api_galleries:
                logger.error('Server response: {}'.format(response.text))
                logger.error(
                    'For provider {}: Got to page {}, but could not parse the response into galleries, stopping'
                    .format(constants.provider_name, query_values['page']))
                break

            # Listen to what the server says
            remaining_queries.value = api_galleries[0].queries
            remaining_queries.save()

            used = Gallery.objects.filter(
                gid__in=[x.gid for x in api_galleries],
                provider=constants.provider_name)

            # If the amount of galleries present in database is equal to what we get from the page,
            # we assume we already processed everything. You can force to process everything by using:
            force_process, force_created = attrs.get_or_create(
                provider=provider,
                name='force_process',
                data_type='bool',
                defaults={
                    'value_bool': False,
                })

            logger.info(
                'For provider {}: Page has {} galleries, from which {} are already present in the database.'
                .format(constants.provider_name, len(api_galleries),
                        used.count()))

            if not force_process.value and used.count() == len(api_galleries):
                logger.info(
                    'For provider {}: Got to page {}, it has already been processed entirely, stopping'
                    .format(constants.provider_name, query_values['page']))
                break

            used_gids = used.values_list('gid', flat=True)

            for gallery_data in api_galleries:
                if gallery_data.gid not in used_gids:
                    if not gallery_data.dl_type:
                        gallery_data.dl_type = 'auto_wanted'
                    wanted_reason = attrs.fetch_value(
                        'wanted_reason_{}'.format(query_name))
                    if isinstance(wanted_reason, str):
                        gallery_data.reason = wanted_reason or 'backup'
                    gallery = Gallery.objects.add_from_values(gallery_data)
                    # We match anyways in case there's a previous WantedGallery.
                    # Actually, we don't match since we only get metadata here, so it should not count as found.
                    publisher_name = ''
                    publisher = gallery.tags.filter(scope='publisher').first()
                    if publisher:
                        publisher_name = publisher.name

                    if not gallery.title_jpn:
                        continue

                    search_title = format_title_to_wanted_search(
                        gallery.title_jpn)

                    wanted_galleries: typing.Iterable[
                        WantedGallery] = WantedGallery.objects.filter(
                            title_jpn=gallery.title_jpn,
                            search_title=search_title)

                    if not wanted_galleries:
                        wanted_gallery = WantedGallery.objects.create(
                            title=gallery.title or gallery.title_jpn,
                            title_jpn=gallery.title_jpn,
                            search_title=search_title,
                            book_type=gallery.category,
                            page_count=gallery.filecount,
                            publisher=publisher_name,
                            add_as_hidden=True,
                            reason=attrs.fetch_value(
                                'wanted_reason_{}'.format(query_name)) or '',
                            public=attrs.fetch_value(
                                'wanted_public_{}'.format(query_name))
                            or False,
                            should_search=attrs.fetch_value(
                                'wanted_should_search_{}'.format(query_name))
                            or True,
                            keep_searching=attrs.fetch_value(
                                'wanted_keep_searching_{}'.format(query_name))
                            or True,
                            category='Manga',
                            unwanted_title=own_settings.unwanted_title
                            or settings.auto_wanted.unwanted_title)
                        wanted_provider_string = attrs.fetch_value(
                            'wanted_provider_{}'.format(query_name))
                        if wanted_provider_string and isinstance(
                                wanted_provider_string, str):
                            wanted_provider_instance = Provider.objects.filter(
                                slug=wanted_provider_string).first()
                            if wanted_provider_instance:
                                wanted_gallery.wanted_providers.add(
                                    wanted_provider_instance)
                        wanted_providers_string = attrs.fetch_value(
                            'wanted_providers_{}'.format(query_name))
                        if wanted_providers_string and isinstance(
                                wanted_providers_string, str):
                            for wanted_provider in wanted_providers_string.split(
                            ):
                                wanted_provider = wanted_provider.strip()
                                wanted_provider_instance = Provider.objects.filter(
                                    slug=wanted_provider).first()
                                if wanted_provider_instance:
                                    wanted_gallery.wanted_providers.add(
                                        wanted_provider_instance)

                        for artist in gallery.tags.filter(scope='artist'):
                            artist_obj = Artist.objects.filter(
                                name_jpn=artist.name).first()
                            if not artist_obj:
                                artist_obj = Artist.objects.create(
                                    name=artist.name, name_jpn=artist.name)
                            wanted_gallery.artists.add(artist_obj)
                        logger.info(
                            "Created wanted gallery ({}): {}, search title: {}"
                            .format(wanted_gallery.book_type,
                                    wanted_gallery.get_absolute_url(),
                                    gallery.title_jpn))

                        wanted_galleries = [wanted_gallery]

                    for wanted_gallery in wanted_galleries:

                        mention, mention_created = wanted_gallery.mentions.get_or_create(
                            mention_date=gallery.create_date,
                            release_date=gallery.posted,
                            type='release_date',
                            source=constants.provider_name,
                        )
                        if mention_created and gallery.thumbnail:
                            mention.copy_img(gallery.thumbnail.path)
                            wanted_gallery.calculate_nearest_release_date()

            # galleries.extend(api_galleries)

            # API returns 25 max results per query, so if we get 24 or less, means there's no more pages.
            # API Manual says 25, but we get 50 results normally!
            if len(api_galleries) < 50:
                logger.info(
                    'Got to page {}, and we got less than 50 galleries, '
                    'meaning there is no more pages, stopping'.format(
                        query_values['page']))
                break

            query_values['page'] += 1

    logger.info("{} Auto wanted ended.".format(constants.provider_name))
Example #29
0
    def crawl_feed(self, feed_url: str = '') -> list[GalleryData]:

        if not feed_url:
            feed_url = constants.rss_url

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        if not response:
            logger.error("Got no response from feed URL: {}".format(feed_url))
            return []

        response.encoding = 'utf-8'

        feed = feedparser.parse(response.text)

        galleries = []

        match_string = re.compile(constants.main_page + '/(.+)/$')
        skip_tags = ['Uncategorized']

        logger.info(
            "Provided RSS URL for provider ({}), adding {} found links".format(
                self.name, len(feed['items'])))

        for item in feed['items']:
            tags = [x.term for x in item['tags'] if x.term not in skip_tags]

            thumbnail_url = ''

            for content in item['content']:
                soup = BeautifulSoup(content.value, 'html.parser')

                artists_container = soup.find_all(
                    "a", href=re.compile(constants.main_page + '/artist/.*/$'))

                for artist in artists_container:
                    tags.append("artist:{}".format(artist.get_text()))

                thumbnail_small_container = soup.find("img")
                if thumbnail_small_container:
                    thumbnail_url = thumbnail_small_container.get('src')

            match_result = match_string.match(item['link'])
            if not match_result:
                continue

            gallery = GalleryData(match_result.group(1),
                                  self.name,
                                  title=item['title'],
                                  comment=item['description'],
                                  thumbnail_url=thumbnail_url,
                                  category='Manga',
                                  uploader=item['author'],
                                  posted=datetime.strptime(
                                      item['published'],
                                      "%a, %d %b %Y %H:%M:%S %z"),
                                  filecount=0,
                                  filesize=0,
                                  expunged=False,
                                  rating='',
                                  tags=translate_tag_list(tags),
                                  content=item['content'][0].value,
                                  link=item['link'])

            # Must check here since this method is called after the main check in crawl_urls
            if self.general_utils.discard_by_tag_list(gallery.tags):
                continue

            if not gallery.link:
                continue

            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery.gid, link=gallery.link)
            if discard_approved:
                if not self.settings.silent_processing:
                    logger.info(discard_message)
                continue

            galleries.append(gallery)

        return galleries
Example #30
0
    def get_values_from_gallery_link_json(self,
                                          link: str) -> Optional[GalleryData]:

        match_string = re.compile(constants.main_page + '/(.+)/$')

        m = match_string.match(link)

        if m:
            gallery_slug = m.group(1)
        else:
            return None

        api_link = constants.posts_api_url

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'slug': gallery_slug}

        response = request_with_retries(
            api_link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error(
                "Could not parse response to JSON: {}".format(api_link))
            return None

        tags = []
        thumbnail_url = ''

        if len(response_data) < 1:
            return None

        api_gallery = response_data[0]

        soup = BeautifulSoup(api_gallery['content']['rendered'], 'html.parser')

        artists_container = soup.find_all("a",
                                          href=re.compile(constants.main_page +
                                                          '/artist/.*/$'))

        for artist in artists_container:
            tags.append("artist:{}".format(artist.get_text()))

        tags_container = soup.find_all("a",
                                       href=re.compile(constants.main_page +
                                                       '/tag/.*/$'))

        for tag in tags_container:
            tags.append(tag.get_text())

        thumbnail_small_container = soup.find("img")
        if thumbnail_small_container:
            thumbnail_url = thumbnail_small_container.get('src')

        gallery = GalleryData(
            gallery_slug,
            self.name,
            link=link,
            title=unescape(api_gallery['title']['rendered']),
            comment='',
            thumbnail_url=thumbnail_url,
            category='Manga',
            uploader='',
            posted=datetime.strptime(api_gallery['date_gmt'] + '+0000',
                                     "%Y-%m-%dT%H:%M:%S%z"),
            filecount=0,
            filesize=0,
            expunged=False,
            rating='',
            tags=translate_tag_list(tags),
            content=api_gallery['content']['rendered'],
        )

        return gallery