Example 1
def map_external_gallery_data_to_internal(
        gallery_data: DataDict) -> GalleryData:
    internal_gallery_data = GalleryData(
        gallery_data['gid'],
        token=gallery_data['token'],
        archiver_key=gallery_data['archiver_key'],
        title=unescape(gallery_data['title']),
        title_jpn=unescape(gallery_data['title_jpn']),
        thumbnail_url=gallery_data['thumb'],
        category=gallery_data['category'],
        provider=constants.provider_name,
        uploader=gallery_data['uploader'],
        posted=datetime.fromtimestamp(int(gallery_data['posted']),
                                      timezone.utc),
        filecount=gallery_data['filecount'],
        filesize=gallery_data['filesize'],
        expunged=gallery_data['expunged'],
        rating=gallery_data['rating'],
        tags=translate_tag_list(gallery_data['tags']),
    )
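    # Flag the gallery as fjord when any of its tags match the configured fjord tag pattern.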
    m = re.search(constants.default_fjord_tags,
                  ",".join(internal_gallery_data.tags))
    if m:
        internal_gallery_data.fjord = True
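    # Swap ExHentai thumbnail hosts for their public e-hentai equivalents.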
    if internal_gallery_data.thumbnail_url and constants.ex_thumb_url in internal_gallery_data.thumbnail_url:
        internal_gallery_data.thumbnail_url = internal_gallery_data.thumbnail_url.replace(
            constants.ex_thumb_url, constants.ge_thumb_url)
    return internal_gallery_data
Example 2
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup:
            info_container = soup.find("div", id=re.compile("info"))
            title_jpn_match = info_container.h2 if info_container else None

            gallery_id_match = re.search(
                r'{}(\d+)'.format(constants.gallery_container_url), link)

            if not gallery_id_match:
                return None
            gallery_id = 'nh-' + gallery_id_match.group(1)

            gallery = GalleryData(gallery_id, self.name)
            gallery.title = soup.h1.get_text()
            gallery.title_jpn = title_jpn_match.get_text() if title_jpn_match else ''
            gallery_filecount_match = re.search(r'<div>(\d+) page(s*)</div>',
                                                response.text)
            if gallery_filecount_match:
                gallery.filecount = int(gallery_filecount_match.group(1))
            else:
                gallery.filecount = 0
            gallery.tags = []
            gallery.link = link
            gallery.posted = dateutil.parser.parse(
                soup.find("time")['datetime'])

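            # Each tag anchor carries its scope (tags, categories, artist, etc.) in its enclosing containers.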
            for tag_container in soup.find_all("a", {"class": "tag"}):
                tag_name = [text for text in tag_container.stripped_strings][0]
                tag_name = tag_name.split(" | ")[0]
                tag_scope = tag_container.parent.parent.get_text()
                tag_ext = tag_container.parent.get_text()
                tag_scope = tag_scope.replace(tag_ext, "").replace(
                    "\t", "").replace("\n", "").replace(":", "").lower()
                if tag_scope == 'tags':
                    gallery.tags.append(translate_tag(tag_name))
                elif tag_scope == 'categories':
                    gallery.category = tag_name.capitalize()
                else:
                    gallery.tags.append(
                        translate_tag(tag_scope + ":" + tag_name))

        else:
            return None
        return gallery
Example 3
    def test_nhentai_parser(self):
        """Test Nhentai gallery page parser"""
        settings = Settings(load_from_disk=True)

        gallery_link = 'https://nhentai.net/g/198482/'
        parser = NhentaiParser(settings)
        data = parser.fetch_gallery_data(gallery_link)

        expected_data = GalleryData(
            'nh-198482',
            'nhentai',
            title=
            '(C90) [MeltdoWN COmet (Yukiu Con)] C90 Omakebon! (Pokémon GO) [English] [ATF]',
            title_jpn='(C90) [MeltdoWN COmet (雪雨こん)] C90 おまけ本! (ポケモンGO) [英訳]',
            filecount=9,
            link='https://nhentai.net/g/198482/',
            posted=dateutil.parser.parse('2017-06-19T10:33:19.022360+00:00'),
            category='Doujinshi',
            tags=[
                'parody:pokemon',
                'lolicon',
                'sole_female',
                'sole_male',
                'b*****b',
                'artist:yukiu_con',
                'group:meltdown_comet',
                'language:translated',
                'language:english',
            ])

        self.assertEqual(data, expected_data)
Example 4
    def crawl_urls(self,
                   urls: List[str],
                   wanted_filters=None,
                   wanted_only: bool = False) -> None:

        unique_urls = set()
        gallery_data_list = []
        gallery_wanted_lists = defaultdict(list)

        if not self.downloaders:
            self.logger.warning('No downloaders enabled, returning.')
            return

        for url in urls:

            if not any(word in url for word in self.accepted_urls):
                self.logger.warning("Invalid URL, skipping: {}".format(url))
                continue

            url = url.replace(constants.base_url, constants.old_base_url)

            unique_urls.add(url)

        for gallery_url in unique_urls:
            gallery_data = GalleryData(gallery_url, link=gallery_url)
            gallery_data_list.append(gallery_data)

        self.pass_gallery_data_to_downloaders(gallery_data_list,
                                              gallery_wanted_lists)
Example 5
    def compare_by_title(self, gallery_title: str) -> bool:

        api_url = urljoin(self.own_settings.metadata_url, constants.api_path)
        logger.info("Querying URL: {}".format(api_url))

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'match': True, 'title': gallery_title}

        response = request_with_retries(
            api_url,
            request_dict,
            post=False, retries=3
        )

        if not response:
            logger.info("Got no response from server")
            return False

        response_data = response.json()

        matches_links = set()

        if 'error' in response_data:
            logger.info("Got error from server: {}".format(response_data['error']))
            return False

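        # Resolve gallery_container/magazine references to locally known gids before building GalleryData.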
        for gallery in response_data:
            if 'link' in gallery:
                matches_links.add(gallery['link'])
                if 'gallery_container' in gallery and gallery['gallery_container']:
                    if self.settings.gallery_model:
                        gallery_container = self.settings.gallery_model.objects.filter(
                            gid=gallery['gallery_container'], provider=gallery['provider']
                        )
                        first_gallery_container = gallery_container.first()
                        if first_gallery_container:
                            gallery['gallery_container_gid'] = first_gallery_container.gid
                if 'magazine' in gallery and gallery['magazine']:
                    if self.settings.gallery_model:
                        magazine = self.settings.gallery_model.objects.filter(
                            gid=gallery['magazine'], provider=gallery['provider']
                        )
                        first_magazine = magazine.first()
                        if first_magazine:
                            gallery['magazine_gid'] = first_magazine.gid
                if 'posted' in gallery:
                    if gallery['posted'] != 0:
                        gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
                    else:
                        gallery['posted'] = None
                self.values_array.append(GalleryData(**gallery))

        self.gallery_links = list(matches_links)
        if len(self.gallery_links) > 0:
            self.found_by = self.name
            return True
        else:
            return False
Example 6
    def crawl_urls(self, urls: list[str], wanted_filters=None, wanted_only: bool = False) -> None:

        unique_urls = set()
        gallery_data_list = []
        gallery_wanted_lists: dict[str, list['WantedGallery']] = defaultdict(list)

        if not self.downloaders:
            logger.warning('No downloaders enabled, returning.')
            return

        for url in urls:
            unique_urls.add(url)

        for gallery_url in unique_urls:
            gallery_data = GalleryData(gallery_url, self.name, link=gallery_url)
            gallery_data_list.append(gallery_data)

        self.pass_gallery_data_to_downloaders(gallery_data_list, gallery_wanted_lists)
Example 7
    def crawl_urls(self, urls: list[str], wanted_filters=None, wanted_only: bool = False) -> None:

        unique_urls = set()
        gallery_data_list = []
        gallery_wanted_lists: dict[str, list['WantedGallery']] = defaultdict(list)

        if not self.downloaders:
            logger.warning('No downloaders enabled, returning.')
            return

        for url in urls:

            if not any(word in url for word in self.accepted_urls):
                logger.warning("Invalid URL, skipping: {}".format(url))
                continue

            if constants.torrent_download_path in url:
                utilities.view_link_from_download_link(url)

            unique_urls.add(url)

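        # Run the internal discard checks before queuing each gallery for download.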
        for gallery in unique_urls:
            gid = self.id_from_url(gallery)
            if not gid:
                continue

            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery_id=gid,
                link=gallery
            )

            if discard_approved:
                if not self.settings.silent_processing:
                    logger.info(discard_message)
                continue

            gallery_data = GalleryData(gid, self.name, link=gallery)
            gallery_data_list.append(gallery_data)

        if not gallery_data_list:
            return

        self.pass_gallery_data_to_downloaders(gallery_data_list, gallery_wanted_lists)
Example 8
    def test_nexus_parser(self):
        """Test Nexus gallery page parser"""
        settings = Settings(load_from_disk=True)

        gallery_link = 'https://hentainexus.com/view/5665'
        parser = NexusParser(settings)
        data = parser.fetch_gallery_data(gallery_link)

        expected_data = GalleryData(
            '5665',
            'nexus',
            link=gallery_link,
            archiver_key='https://hentainexus.com/zip/5665',
            title='Sase-san is Very Popular',
            thumbnail_url=
            'https://static.hentainexus.com/content/5665/cover.jpg',
            filecount=16,
            filesize=0,
            expunged=False,
            posted=None,
            category='Manga',
            tags=[
                'artist:wantan_meo',
                'language:english',
                'magazine:comic_kairakuten_2019-04',
                'parody:original_work',
                'publisher:fakku',
                'creampie',
                'fangs',
                'hairy',
                'hentai',
                'office_lady',
                'oppai',
                'uncensored',
                'vanilla',
            ],
            comment='Let\'s chug \'em down! ♪',
        )

        self.assertEqual(data, expected_data)
Example 9
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'

        match_string = re.compile(constants.main_page + '/(.+)/$')

        tags = []

        soup = BeautifulSoup(response.text, 'html.parser')

        content_container = soup.find("div", class_="content")

        if not content_container:
            return None

        artists_container = content_container.find_all(
            "a", href=re.compile(constants.main_page + '/artist/.*/$'))

        for artist in artists_container:
            tags.append("artist:{}".format(artist.get_text()))

        tags_container = content_container.find_all(
            "a", href=re.compile(constants.main_page + '/tag/.*/$'))

        for tag in tags_container:
            tags.append(tag.get_text())

        # thumbnail_small_container = content_container.find("img")
        # if thumbnail_small_container:
        #     thumbnail_url = thumbnail_small_container.get('src')
        # og: meta tags store their value in the content attribute; extract the
        # strings instead of passing bs4 Tag objects to re.match and GalleryData.
        thumbnail_meta = soup.find("meta", property="og:image")
        thumbnail_url = thumbnail_meta.get('content', '') if thumbnail_meta else ''

        link_meta = soup.find("meta", property="og:link")
        if not link_meta:
            return None
        match_result = match_string.match(link_meta.get('content', ''))
        if not match_result:
            return None

        title_meta = soup.find("meta", property="og:title")

        gallery = GalleryData(
            match_result.group(1),
            self.name,
            link=link,
            title=title_meta.get('content', '') if title_meta else '',
            comment='',
            thumbnail_url=thumbnail_url,
            category='Manga',
            uploader='',
            posted=None,
            filecount=0,
            filesize=0,
            expunged=False,
            rating='',
            tags=translate_tag_list(tags),
            content=content_container.encode_contents(),
        )

        return gallery
Example 10
    def crawl_feed(self, feed_url: str = '') -> list[GalleryData]:

        if not feed_url:
            feed_url = constants.rss_url

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            feed_url,
            request_dict,
            post=False,
        )

        if not response:
            logger.error("Got no response from feed URL: {}".format(feed_url))
            return []

        response.encoding = 'utf-8'

        feed = feedparser.parse(response.text)

        galleries = []

        match_string = re.compile(constants.main_page + '/(.+)/$')
        skip_tags = ['Uncategorized']

        logger.info(
            "Provided RSS URL for provider ({}), adding {} found links".format(
                self.name, len(feed['items'])))

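        # Each feed item embeds rendered HTML; scrape artist links and a thumbnail out of it.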
        for item in feed['items']:
            tags = [x.term for x in item['tags'] if x.term not in skip_tags]

            thumbnail_url = ''

            for content in item['content']:
                soup = BeautifulSoup(content.value, 'html.parser')

                artists_container = soup.find_all(
                    "a", href=re.compile(constants.main_page + '/artist/.*/$'))

                for artist in artists_container:
                    tags.append("artist:{}".format(artist.get_text()))

                thumbnail_small_container = soup.find("img")
                if thumbnail_small_container:
                    thumbnail_url = thumbnail_small_container.get('src')

            match_result = match_string.match(item['link'])
            if not match_result:
                continue

            gallery = GalleryData(match_result.group(1),
                                  self.name,
                                  title=item['title'],
                                  comment=item['description'],
                                  thumbnail_url=thumbnail_url,
                                  category='Manga',
                                  uploader=item['author'],
                                  posted=datetime.strptime(
                                      item['published'],
                                      "%a, %d %b %Y %H:%M:%S %z"),
                                  filecount=0,
                                  filesize=0,
                                  expunged=False,
                                  rating='',
                                  tags=translate_tag_list(tags),
                                  content=item['content'][0].value,
                                  link=item['link'])

            # Must check here since this method is called after the main check in crawl_urls
            if self.general_utils.discard_by_tag_list(gallery.tags):
                continue

            if not gallery.link:
                continue

            discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                gallery.gid, link=gallery.link)
            if discard_approved:
                if not self.settings.silent_processing:
                    logger.info(discard_message)
                continue

            galleries.append(gallery)

        return galleries
Example 11
    def work_gallery_data(self, gallery: GalleryData,
                          gallery_wanted_lists) -> None:

        if not gallery.token:
            return

        m = re.search(constants.default_fjord_tags, ",".join(gallery.tags))
        if m:
            gallery.root = constants.ex_page
            gallery.link = link_from_gid_token_fjord(gallery.gid,
                                                     gallery.token, True)
        else:
            gallery.root = constants.ge_page
            gallery.link = link_from_gid_token_fjord(gallery.gid,
                                                     gallery.token, False)

        self.logger.info("Title: {}. Link: {}".format(gallery.title,
                                                      gallery.link))

        gallery_is_hidden = False
        gallery.hidden = False

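        # Optionally crawl the gallery page itself to detect hidden galleries and newer revisions.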
        if self.own_settings.crawl_gallery_page:
            gallery_page_text = requests.get(
                gallery.link,
                cookies=self.own_settings.cookies,
                headers=self.settings.requests_headers,
                timeout=self.settings.timeout_timer).text

            if 'Gallery Not Available' in gallery_page_text:
                if gallery.root == constants.ex_page:
                    self.logger.warning(
                        'EX Gallery not available, probably taken down/hidden')
                    gallery_is_hidden = True
                    gallery.hidden = True
                elif gallery.root == constants.ge_page:
                    time.sleep(self.settings.wait_timer)
                    gallery.root = constants.ex_page
                    gallery.link = link_from_gid_token_fjord(
                        gallery.gid, gallery.token, True)
                    retry_gallery_page_text = requests.get(
                        gallery.link,
                        cookies=self.own_settings.cookies,
                        headers=self.settings.requests_headers,
                        timeout=self.settings.timeout_timer).text
                    if 'Gallery Not Available' in retry_gallery_page_text:
                        self.logger.warning(
                            'Tried EX gallery instead of E-H for '
                            'arbitrary hidden galleries, also not '
                            'available, probably taken down/hidden')
                        gallery_is_hidden = True
                        gallery.hidden = True
                    else:
                        self.logger.info(
                            "Changed from E-H to EX, was an arbitrary hidden gallery"
                        )
                        gallery_page_text = retry_gallery_page_text
                        time.sleep(self.settings.wait_timer)
                        new_gallery = self.fetch_gallery_data(gallery.link)
                        if new_gallery and new_gallery.token:
                            gallery = new_gallery
                            gallery.root = constants.ex_page
                            gallery.link = link_from_gid_token_fjord(
                                new_gallery.gid, new_gallery.token, True)

            time.sleep(self.settings.wait_timer)

            if not gallery_is_hidden:
                gallery_parser = GalleryHTMLParser()
                gallery_parser.feed(gallery_page_text)

                # What this does is log a parent gallery if we are downloading a newer one.
                # Maybe this should be an option, because we are deleting files from disk.
                if gallery_parser.found_parent_gallery and gallery_parser.parent_gallery:
                    parent_gid, parent_token = get_gid_token_from_link(
                        gallery_parser.parent_gallery)
                    archives = Archive.objects.filter(
                        gallery__gid__exact=parent_gid,
                        gallery__token__exact=parent_token)
                    for archive in archives:
                        self.logger.warning(
                            "Gallery: {} has a parent gallery {}, "
                            "which is matched with archive: {}. You might want to delete them."
                            .format(gallery.link,
                                    archive.gallery.get_absolute_url(),
                                    archive.get_absolute_url()))

                if (gallery_parser.found_non_final_gallery == 2
                        and self.own_settings.get_newer_gallery
                        and gallery_parser.non_final_gallery):
                    self.logger.info(
                        "Found non final gallery, next is: {}".format(
                            gallery_parser.non_final_gallery))
                    (exists_next,
                     new_gallery) = self.get_final_gallery_from_link(
                         gallery_parser.non_final_gallery)
                    if exists_next == 1 and new_gallery and new_gallery.link:
                        gallery.dl_type = 'skipped:non_final'
                        Gallery.objects.update_or_create_from_values(gallery)
                        discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                            gallery_id=new_gallery.gid, link=new_gallery.link)
                        if discard_approved:
                            if not self.settings.silent_processing:
                                self.logger.info(discard_message)
                            return
                        gallery = new_gallery
                        self.logger.info("Final gallery is: {}".format(
                            new_gallery.link))
                    elif exists_next == 0:
                        self.logger.info(
                            "Last in sequence: {} is not available, using current one"
                            .format(gallery.link))
                    elif exists_next == 2:
                        self.logger.info(
                            "Last in sequence: {} was discarded by global tags, "
                            "skipping this gallery altogether".format(
                                gallery.link))
                        gallery.dl_type = 'skipped:final_replaced'
                        Gallery.objects.update_or_create_from_values(gallery)
                        return

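        # Try each enabled downloader in order; on total failure, retry once through the fjord (EX) link.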
        retry_attempt = True
        while retry_attempt:

            for cnt, downloader in enumerate(self.downloaders):
                if not gallery_is_hidden or not downloader[0].skip_if_hidden:
                    downloader[0].init_download(copy.deepcopy(gallery))
                else:
                    downloader[0].return_code = 0
                if downloader[0].return_code == 1:
                    for wanted_gallery in gallery_wanted_lists[gallery.gid]:
                        FoundGallery.objects.get_or_create(
                            wanted_gallery=wanted_gallery,
                            gallery=downloader[0].gallery_db_entry)
                        if wanted_gallery.add_as_hidden and downloader[0].gallery_db_entry:
                            downloader[0].gallery_db_entry.hidden = True
                            downloader[0].gallery_db_entry.save()
                        if downloader[0].archive_db_entry and wanted_gallery.reason:
                            downloader[0].archive_db_entry.reason = wanted_gallery.reason
                            downloader[0].archive_db_entry.simple_save()

                    if len(gallery_wanted_lists[gallery.gid]) > 0:
                        wanted_gallery_found.send(
                            sender=self.settings.gallery_model,
                            gallery=downloader[0].gallery_db_entry,
                            wanted_gallery_list=gallery_wanted_lists[gallery.gid])

                    self.last_used_downloader = str(downloader[0])
                    if downloader[0].gallery_db_entry:
                        if downloader[0].archive_db_entry:
                            self.logger.info(
                                "Download completed successfully, using {}. Archive link: {}. Gallery link: {}".format(
                                    downloader[0],
                                    downloader[0].archive_db_entry.get_absolute_url(),
                                    downloader[0].gallery_db_entry.get_absolute_url()))
                        else:
                            self.logger.info(
                                "Download completed successfully (gallery only), using {}. Gallery link: {}".format(
                                    downloader[0],
                                    downloader[0].gallery_db_entry.get_absolute_url()))
                    return
                if (downloader[0].return_code == 0
                        and (cnt + 1) == len(self.downloaders)):
                    if gallery.root == constants.ge_page and not gallery_is_hidden and gallery.token:
                        gallery.root = constants.ex_page
                        gallery.link = link_from_gid_token_fjord(
                            gallery.gid, gallery.token, True)
                        # fetch archiver key again when retrying.
                        new_gallery_data = self.fetch_gallery_data(
                            gallery.link)
                        if new_gallery_data:
                            gallery.archiver_key = new_gallery_data.archiver_key
                            self.logger.info(
                                "Retrying with fjord link, might be hidden.")
                            break
                        else:
                            self.logger.error("Could not fetch fjord link.")
                    else:
                        self.logger.error(
                            "Finished retrying using fjord link.")
                    downloader[0].original_gallery = gallery
                    downloader[0].original_gallery.hidden = True
                    downloader[0].original_gallery.dl_type = 'failed'
                    downloader[0].update_gallery_db()
                    if downloader[0].gallery_db_entry:
                        self.last_used_downloader = 'none'
                        self.logger.warning(
                            "Download completed unsuccessfully, set as failed. Gallery link: {}".format(
                                downloader[0].gallery_db_entry.get_absolute_url()))
                        for wanted_gallery in gallery_wanted_lists[gallery.gid]:
                            FoundGallery.objects.get_or_create(
                                wanted_gallery=wanted_gallery,
                                gallery=downloader[0].gallery_db_entry)
                    retry_attempt = False
Example 12
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        request_dict = construct_request_dict(self.settings, self.own_settings)

        response = request_with_retries(
            link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        gallery_container_head = soup.find("head")

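        # Most of the metadata lives in OpenGraph/book: meta tags inside the page head.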
        if gallery_container_head:
            gid_container = gallery_container_head.find("meta",
                                                        property="og:url")

            if gid_container:
                url_parts = gid_container['content'].split('/')
                gid = url_parts[-1]
                gallery = GalleryData(gid, self.name)
                gallery.link = link
                gallery.tags = []
                gallery.category = 'Doujinshi'

                title_container = gallery_container_head.find(
                    "meta", property="og:title")
                image_url_container = gallery_container_head.find(
                    "meta", property="og:image")
                tags_containers = gallery_container_head.find_all(
                    "meta", property="book:tag")
                page_count_container = gallery_container_head.find(
                    "meta", property="books:page_count")
                description_container = gallery_container_head.find(
                    "meta", property="og:description")
                author_container = soup.find("span", itemprop="author")
                section_container = soup.find(
                    "section", class_="showcase_comic_single_description")
                gallery_container_titles = soup.find(
                    "hgroup", class_='showcase_comic_single_main_titles')

                if title_container:
                    gallery.title = title_container['content']
                if gallery_container_titles:
                    title_jpn_container = gallery_container_titles.find(
                        "span", lang='ja')
                    if title_jpn_container:
                        gallery.title_jpn = title_jpn_container.get_text()
                if image_url_container:
                    gallery.thumbnail_url = image_url_container['content']
                if description_container:
                    gallery.comment = description_container['content']
                if page_count_container:
                    gallery.filecount = int(page_count_container['content'])
                if author_container:
                    group_name = author_container.find("meta", itemprop="name")
                    if group_name:
                        gallery.tags.append(
                            translate_tag("group:" + group_name['content']))
                if section_container:
                    time_container = section_container.find(
                        "time", itemprop="datePublished")
                    if time_container:
                        gallery.posted = datetime.fromisoformat(
                            time_container['datetime'] + '+00:00')
                    p_containers = section_container.find_all("p")
                    for ps in p_containers:
                        p_text = ps.get_text()
                        if 'Source:' in p_text:
                            parody_name = p_text.replace('Source: ',
                                                         '').rstrip()
                            gallery.tags.append(
                                translate_tag("parody:" + parody_name))

                for tag_container in tags_containers:
                    tag = translate_tag(tag_container['content'])
                    gallery.tags.append(tag)

                gallery.tags.append(translate_tag("language:english"))

            else:
                return None
        else:
            return None
        return gallery
Example 13
    def test_fakku_parser(self):
        """Test FAKKU gallery page parser"""
        settings = Settings(load_from_disk=True)

        gallery_link = 'https://www.fakku.net/hentai/im-a-piece-of-junk-sexaroid-english'
        parser = FakkuParser(settings)
        data = parser.fetch_gallery_data(gallery_link)

        expected_data = GalleryData(
            'hentai/im-a-piece-of-junk-sexaroid-english',
            'fakku',
            link=gallery_link,
            title='I\'m a Piece of Junk Sexaroid',
            thumbnail_url=
            'https://t.fakku.net/images/manga/i/im-a-piece-of-junk-sexaroid-english/thumbs/002.thumb.jpg',
            filecount=16,
            category='Manga',
            tags=[
                'artist:wakame-san',
                'magazine:comic_kairakuten_beast_2017-05',
                'publisher:fakku',
                'language:english',
                'tsundere',
                'femdom',
                'vanilla',
                'b*****b',
                'oppai',
                'hentai',
                'creampie',
                'uncensored',
                'x-ray',
                'subscription',
            ],
            comment='Plump slacker sex robot ❤',
        )

        self.assertEqual(data, expected_data)

        gallery_link = 'https://www.fakku.net/hentai/tsf-story-append-20-english_1497401155'
        parser = FakkuParser(settings)
        data = parser.fetch_gallery_data(gallery_link)

        expected_data = GalleryData(
            'hentai/tsf-story-append-20-english_1497401155',
            'fakku',
            link=gallery_link,
            title='TSF Story Append 2.0',
            filecount=82,
            category='Doujinshi',
            tags=[
                'artist:oda_non',
                'artist:yasui_riosuke',
                'artist:meme50',
                'artist:kojima_saya',
                'artist:butcha-u',
                'artist:mizuryu_kei',
                'artist:kurenai_yuuji',
                'artist:soine',
                'artist:asanagi',
                'artist:yumeno_tanuki',
                'artist:hiroyuki_sanadura',
                'artist:shindo_l',
                'artist:naokame',
                'artist:kin_no_hiyoko',
                'artist:masaru_yajiro',
                'group:da_hootch',
                'publisher:enshodo',
                'language:english',
                'anal',
                'b*****b',
                'oppai',
                'glasses',
                'stockings',
                'group',
                'nurse',
                'hentai',
                'ahegao',
                'creampie',
                'uncensored',
                'genderbend',
                'doujin',
            ],
            comment=
            "Takumi's life as a girl only continues to get more wild, as he (she?) continues to fall deeper into a life of promiscuity, drugs and unprotected sex with strangers. Will his friend Ryou be able to pull him out of this terrible spiral?",
            thumbnail_url=
            'https://t.fakku.net/images/manga/t/tsf-story-append-20-english_1497401155_1502575464/thumbs/001.thumb.jpg',
        )

        self.assertEqual(data, expected_data)
Example 14
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        response = request_with_retries(link, {
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer,
        },
                                        post=False,
                                        logger=self.logger)

        if not response:
            self.logger.error("Got no response from: {}".format(link))
            return None

        response.encoding = 'utf-8'

        match_string = re.compile(constants.main_page + r"/view/(\d+)/*$")

        tags = []

        soup = BeautifulSoup(response.text, 'html.parser')

        content_container = soup.find("div", class_="container")

        if not content_container:
            self.logger.error("Could not find content container")
            return None

        is_doujinshi = False

        artists_container = content_container.find_all(
            "a", href=re.compile('/?q=artist:.*$'))

        for artist in artists_container:
            tags.append("artist:{}".format(artist.get_text()))

        language_container = content_container.find_all(
            "a", href=re.compile('/?q=language:.*$'))

        for language in language_container:
            tags.append("language:{}".format(language.get_text()))

        magazine_container = content_container.find_all(
            "a", href=re.compile('/?q=magazine:.*$'))

        for magazine in magazine_container:
            tags.append("magazine:{}".format(magazine.get_text()))

        parody_container = content_container.find_all(
            "a", href=re.compile('/?q=parody:.*$'))

        for parody in parody_container:
            tags.append("parody:{}".format(parody.get_text()))

        publisher_container = content_container.find_all(
            "a", href=re.compile('/?q=publisher:.*$'))

        for publisher in publisher_container:
            tags.append("publisher:{}".format(publisher.get_text()))

        tags_container = content_container.find_all(
            "a", href=re.compile('/?q=tag:.*$'))

        for tag in tags_container:
            tag_cleaned = tag.get_text().replace("\t", "").replace("\n", "")
            tags.append(tag_cleaned)

            if tag_cleaned == 'doujin':
                is_doujinshi = True

        thumbnail_meta = soup.find("meta", property="og:image")
        thumbnail_url = thumbnail_meta.get('content', '') if thumbnail_meta else ''

        url_meta = soup.find("meta", property="og:url")
        match_result = match_string.match(url_meta.get('content', '')) if url_meta else None
        if not match_result:
            self.logger.error("Could not find gallery info container")
            return None

        gallery_id = match_result.group(1)

        gallery = GalleryData(
            gallery_id,
            link=link,
            title=content_container.find("h1", class_="title").get_text(),
            thumbnail_url=thumbnail_url,
            provider=self.name,
            posted=None,
            filesize=0,
            expunged=False,
            tags=translate_tag_list(tags),
        )

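        # The details table holds label/value cell pairs; flags carry state from a label cell to its value cell.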
        table_container = content_container.find("table",
                                                 class_="view-page-details")

        if table_container:
            tr_container = table_container.find_all("tr")

            for tr in tr_container:

                if isinstance(tr, bs4.element.Tag):

                    td_container = tr.find_all("td")

                    is_description = False
                    is_pages = False

                    for td in td_container:
                        if is_description:
                            gallery.comment = td.get_text().replace("\t", "").replace("\n", "")
                            is_description = False
                        if isinstance(td, bs4.element.Tag) and td.get_text() == 'Description':
                            is_description = True

                        if is_pages:
                            right_text = td.get_text().replace("\t", "").replace("\n", "")
                            m = re.search(r'(\d+)', right_text)
                            if m:
                                gallery.filecount = int(m.group(1))
                            is_pages = False
                        if isinstance(td, bs4.element.Tag) and td.get_text() == 'Pages':
                            is_pages = True

        gallery.archiver_key = "{}/zip/{}".format(constants.main_page,
                                                  gallery_id)

        if is_doujinshi:
            gallery.category = 'Doujinshi'
        else:
            gallery.category = 'Manga'

        return gallery
Example 15
    def get_values_from_gallery_link_json(self,
                                          link: str) -> Optional[GalleryData]:

        match_string = re.compile(constants.main_page + '/(.+)/$')

        m = match_string.match(link)

        if m:
            gallery_slug = m.group(1)
        else:
            return None

        api_link = constants.posts_api_url

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_dict['params'] = {'slug': gallery_slug}

        response = request_with_retries(
            api_link,
            request_dict,
            post=False,
        )

        if not response:
            return None

        response.encoding = 'utf-8'
        try:
            response_data = response.json()
        except (ValueError, KeyError):
            logger.error(
                "Could not parse response to JSON: {}".format(api_link))
            return None

        tags = []
        thumbnail_url = ''

        if len(response_data) < 1:
            return None

        api_gallery = response_data[0]

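        # The API returns WordPress-rendered HTML; parse it to recover artist links and a thumbnail.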
        soup = BeautifulSoup(api_gallery['content']['rendered'], 'html.parser')

        artists_container = soup.find_all("a",
                                          href=re.compile(constants.main_page +
                                                          '/artist/.*/$'))

        for artist in artists_container:
            tags.append("artist:{}".format(artist.get_text()))

        tags_container = soup.find_all("a",
                                       href=re.compile(constants.main_page +
                                                       '/tag/.*/$'))

        for tag in tags_container:
            tags.append(tag.get_text())

        thumbnail_small_container = soup.find("img")
        if thumbnail_small_container:
            thumbnail_url = thumbnail_small_container.get('src')

        gallery = GalleryData(
            gallery_slug,
            self.name,
            link=link,
            title=unescape(api_gallery['title']['rendered']),
            comment='',
            thumbnail_url=thumbnail_url,
            category='Manga',
            uploader='',
            posted=datetime.strptime(api_gallery['date_gmt'] + '+0000',
                                     "%Y-%m-%dT%H:%M:%S%z"),
            filecount=0,
            filesize=0,
            expunged=False,
            rating='',
            tags=translate_tag_list(tags),
            content=api_gallery['content']['rendered'],
        )

        return gallery
Example 16
    def crawl_json(self,
                   json_string: str,
                   wanted_filters: Optional[QuerySet] = None,
                   wanted_only: bool = False) -> None:

        if not self.settings.gallery_model:
            return

        dict_list = []
        json_decoded = json.loads(json_string)

        if isinstance(json_decoded, dict):
            dict_list.append(json_decoded)
        elif isinstance(json_decoded, list):
            dict_list = json_decoded

        galleries_gids = []
        found_galleries = set()
        total_galleries_filtered: List[GalleryData] = []
        gallery_wanted_lists: Dict[str,
                                   List['WantedGallery']] = defaultdict(list)

        for gallery in dict_list:
            galleries_gids.append(gallery['gid'])
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']),
                                                       timezone.utc)
            gallery_data = GalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

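        # Look up gids in chunks of 900, likely to stay below SQL backends' bound-parameter limits.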
        for galleries_gid_group in list(chunks(galleries_gids, 900)):
            for found_gallery in self.settings.gallery_model.objects.filter(
                    gid__in=galleries_gid_group):
                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery=found_gallery, link=found_gallery.get_link())

                if discard_approved:
                    self.logger.info(discard_message)
                    found_galleries.add(found_gallery.gid)

        for count, gallery in enumerate(total_galleries_filtered, start=1):

            if gallery.gid in found_galleries:
                continue

            if self.general_utils.discard_by_tag_list(gallery.tags):
                self.logger.info(
                    "Gallery {} of {}: Skipping gallery {}, because it's tagged with global discarded tags"
                    .format(count, len(total_galleries_filtered),
                            gallery.title))
                continue

            if wanted_filters:
                self.compare_gallery_with_wanted_filters(
                    gallery, gallery.link, wanted_filters,
                    gallery_wanted_lists)
                if wanted_only and not gallery_wanted_lists[gallery.gid]:
                    continue

            self.logger.info(
                "Gallery {} of {}: Gallery {} will be processed.".format(
                    count, len(total_galleries_filtered), gallery.title))

            if gallery.thumbnail:
                original_thumbnail_url = gallery.thumbnail_url

                gallery.thumbnail_url = gallery.thumbnail

                gallery_instance = self.settings.gallery_model.objects.update_or_create_from_values(
                    gallery)

                gallery_instance.thumbnail_url = original_thumbnail_url

                gallery_instance.save()
            else:
                self.settings.gallery_model.objects.update_or_create_from_values(
                    gallery)
Example 17
    def get_values_from_gallery_link(self, link: str) -> Optional[GalleryData]:

        response = request_with_retries(link, {
            'headers': self.settings.requests_headers,
            'timeout': self.settings.timeout_timer,
            'cookies': self.own_settings.cookies
        },
                                        post=False,
                                        logger=self.logger)

        if not response:
            return None

        response.encoding = 'utf-8'
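        # Drop the stray </b> that follows numeric right-hand cells so html.parser builds a sane tree.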
        new_text = re.sub(r'(<div class="right">\d+?)</b>', r'\1',
                          response.text)
        soup = BeautifulSoup(new_text, 'html.parser')
        gallery_container = soup.find("div", class_=re.compile("content-wrap"))

        if gallery_container:
            gallery = GalleryData(
                link.replace(constants.main_url + '/',
                             '').replace('manga/', 'hentai/'))
            gallery.link = link
            gallery.tags = []
            gallery.provider = self.name
            gallery.title = gallery_container.find(
                "div", class_="content-name").h1.get_text()

            if gallery.gid.startswith('manga') or gallery.gid.startswith(
                    'hentai'):
                gallery.category = 'Manga'
            elif gallery.gid.startswith('doujinshi'):
                gallery.category = 'Doujinshi'

            thumbnail_container = gallery_container.find("img",
                                                         class_="tablet-50")
            if thumbnail_container:
                gallery.thumbnail_url = thumbnail_container.get("src")
                if gallery.thumbnail_url and gallery.thumbnail_url.startswith(
                        '//'):
                    gallery.thumbnail_url = 'https:' + gallery.thumbnail_url

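            # Each metadata row pairs a left-hand label with a right-hand value; map labels to tag scopes.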
            is_doujinshi = False
            for gallery_row in gallery_container.find_all(
                    "div", {"class": "row"}):
                left_text = gallery_row.find("div", {
                    "class": "row-left"
                }).get_text()
                right_div = gallery_row.find("div", {"class": "row-right"})
                if left_text == "Series" or left_text == "Parody":
                    right_text = right_div.get_text()
                    # if not right_text == "Original Work":
                    gallery.tags.append(translate_tag("parody:" + right_text))
                elif left_text == "Artist":
                    for artist in right_div.find_all("a"):
                        gallery.tags.append(
                            translate_tag("artist:" + artist.get_text()))
                elif left_text == "Magazine":
                    gallery.tags.append(
                        translate_tag("magazine:" + right_div.get_text()))
                elif left_text == "Publisher":
                    gallery.tags.append(
                        translate_tag("publisher:" + right_div.get_text()))
                elif left_text == "Circle":
                    gallery.tags.append(
                        translate_tag("group:" + right_div.get_text()))
                elif left_text == "Event":
                    gallery.tags.append(
                        translate_tag("event:" + right_div.get_text()))
                elif left_text == "Book":
                    belongs_to_container = right_div.find("a")
                    if belongs_to_container:
                        gallery.gallery_container_gid = belongs_to_container.get(
                            "href")[1:]
                elif left_text == "Language":
                    gallery.tags.append(
                        translate_tag("language:" + right_div.get_text()))
                elif left_text == "Pages":
                    right_text = right_div.get_text()
                    m = re.search(r'^(\d+)', right_text)
                    if m:
                        gallery.filecount = int(m.group(1))
                elif left_text == "Uploader":
                    gallery.uploader, right_date_text = right_div.get_text().split(' on ')
                    right_date_text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1',
                                             right_date_text)
                    gallery.posted = datetime.strptime(right_date_text,
                                                       "%B %d, %Y")
                elif left_text == "Description":
                    gallery.comment = right_div.get_text()
                elif left_text == "Tags":
                    for tag_a in right_div.find_all(
                            "a", href=lambda x: x and '/tags/' in x):
                        if tag_a.get_text() == 'doujin':
                            is_doujinshi = True
                        gallery.tags.append(translate_tag(tag_a.get_text()))
            if is_doujinshi:
                gallery.category = 'Doujinshi'
            else:
                gallery.category = 'Manga'
        else:
            return None
        return gallery
Example 18
def convert_api_response_text_to_gallery_dicts(text: str) -> list[GalleryData]:
    galleries: list[GalleryData] = []
    # Based on: https://www.doujinshi.org/API_MANUAL.txt
    xml_root = ElementTree.fromstring(text)
    error = xml_root.find('ERROR')
    # ElementTree elements with no children are falsy, so test for presence against None.
    if error is not None:
        return galleries
    for book in xml_root.findall('BOOK'):

        book_id = book.get('ID')

        if book_id is None:
            continue

        integer_id = int(book_id.replace('B', ''))
        gallery = GalleryData('mugi-' + book_id, constants.provider_name)
        gallery.link = constants.main_page + '/book/' + str(integer_id)
        gallery.tags = []
        found_en_title = book.find('NAME_EN')
        if found_en_title is not None:
            gallery.title = found_en_title.text or ''
        found_jp_title = book.find('NAME_JP')
        if found_jp_title is not None:
            gallery.title_jpn = found_jp_title.text or ''
        gallery.comment = ''
        gallery.category = ''
        gallery.filesize = 0
        found_data_pages = book.find('DATA_PAGES')
        if found_data_pages is not None and found_data_pages.text:
            gallery.filecount = int(found_data_pages.text)
        else:
            gallery.filecount = 0
        gallery.uploader = ''
        gallery.thumbnail_url = 'https://img.doujinshi.org/big/{}/{}.jpg'.format(
            integer_id // 2000, integer_id)
        found_user = xml_root.find('USER')
        if found_user is not None:
            found_user_queries = found_user.find('Queries')
            if found_user_queries is not None and found_user_queries.text:
                gallery.queries = int(found_user_queries.text)
            else:
                gallery.queries = 0
        else:
            gallery.queries = 0

        # Check if we get the 0000-00-00 date
        found_date_released = book.find('DATE_RELEASED')
        if found_date_released is not None and found_date_released.text:
            date_components = found_date_released.text.split("-")
            if len(date_components) >= 3:
                if date_components[0] != '0000' and date_components[1] != '00' and date_components[2] != '00':
                    gallery.posted = datetime.strptime(
                        found_date_released.text + ' +0000', '%Y-%m-%d %z')

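        # LINKS items map the site's author/circle types onto this app's artist/group tag scopes.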
        found_links = book.find('LINKS')

        if found_links is not None:
            for item in found_links:
                item_type = item.get('TYPE')
                item_name_en = item.find('NAME_EN')
                if item_type == 'author':
                    item_type = 'artist'
                elif item_type == 'circle':
                    item_type = 'group'
                elif item_type == 'type' and item_name_en is not None:
                    gallery.category = item_name_en.text
                    continue
                elif item_type is None or item_type == '':
                    if item_name_en is not None and item_name_en.text:
                        gallery.tags.append(translate_tag(item_name_en.text))
                    continue
                if item_name_en is not None and item_name_en.text:
                    gallery.tags.append(
                        translate_tag(item_type + ":" + item_name_en.text))

        found_data_language = book.find('DATA_LANGUAGE')

        if found_data_language is not None and found_data_language.text:
            gallery.tags.append(
                translate_tag(
                    "language:" +
                    translate_language_code(found_data_language.text)))

        # Add non-h as a tag.
        found_age = book.find('DATA_AGE')
        if found_age is not None and found_age.text == '0':
            gallery.tags.append(translate_tag('non-h'))

        galleries.append(gallery)

    return galleries
Example 19
    def process_regular_gallery_page(
            self, link: str, response_text: str) -> Optional[GalleryData]:

        soup = BeautifulSoup(response_text, 'html.parser')
        gallery_container = soup.find("div", class_=re.compile("content-wrap"))
        if gallery_container:
            gallery = GalleryData(
                link.replace(constants.main_url + '/',
                             '').replace('manga/', 'hentai/'), self.name)
            gallery.link = link
            gallery.tags = []
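            # Optionally take the posted date from the aux feed instead of the gallery page.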
            if self.own_settings.get_posted_date_from_feed:
                gallery.posted = self.parse_posted_date_from_feed(
                    constants.aux_feed_url, gallery.gid)
            gallery.title = gallery_container.find(
                "div", class_="content-name").h1.get_text()

            if gallery.gid.startswith('manga') or gallery.gid.startswith(
                    'hentai'):
                gallery.category = 'Manga'
            elif gallery.gid.startswith('doujinshi'):
                gallery.category = 'Doujinshi'

            thumbnail_container = gallery_container.find("img",
                                                         class_="tablet-50")
            if thumbnail_container:
                gallery.thumbnail_url = thumbnail_container.get("src")
                if gallery.thumbnail_url and gallery.thumbnail_url.startswith(
                        '//'):
                    gallery.thumbnail_url = 'https:' + gallery.thumbnail_url

            is_doujinshi = False
            for gallery_row in gallery_container.find_all(
                    "div", {"class": "row"}):
                left_text = gallery_row.find("div", {
                    "class": "row-left"
                }).get_text()
                right_div = gallery_row.find("div", {"class": "row-right"})
                if left_text == "Series" or left_text == "Parody":
                    right_text = right_div.get_text().strip()
                    # if not right_text == "Original Work":
                    gallery.tags.append(translate_tag("parody:" + right_text))
                elif left_text == "Artist":
                    for artist in right_div.find_all("a"):
                        gallery.tags.append(
                            translate_tag("artist:" +
                                          artist.get_text().strip()))
                elif left_text == "Author":
                    for author in right_div.find_all("a"):
                        gallery.tags.append(
                            translate_tag("author:" +
                                          author.get_text().strip()))
                elif left_text == "Magazine":
                    gallery.tags.append(
                        translate_tag("magazine:" +
                                      right_div.get_text().strip()))
                    belongs_to_magazine = right_div.find("a")
                    if belongs_to_magazine:
                        gallery.magazine_gid = belongs_to_magazine.get(
                            "href")[1:]
                elif left_text == "Publisher":
                    gallery.tags.append(
                        translate_tag("publisher:" +
                                      right_div.get_text().strip()))
                elif left_text == "Circle":
                    gallery.tags.append(
                        translate_tag("group:" + right_div.get_text().strip()))
                elif left_text == "Event":
                    gallery.tags.append(
                        translate_tag("event:" + right_div.get_text().strip()))
                elif left_text == "Book":
                    belongs_to_container = right_div.find("a")
                    if belongs_to_container:
                        gallery.gallery_container_gid = belongs_to_container.get(
                            "href")[1:]
                elif left_text == "Language":
                    gallery.tags.append(
                        translate_tag("language:" +
                                      right_div.get_text().strip()))
                elif left_text == "Pages":
                    right_text = right_div.get_text().strip()
                    m = re.search(r'^(\d+)', right_text)
                    if m:
                        gallery.filecount = int(m.group(1))
                elif left_text == "Uploader":
                    gallery.uploader, right_date_text = right_div.get_text(
                    ).strip().split(' on ')
                    right_date_text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1',
                                             right_date_text)
                    gallery.posted = datetime.strptime(right_date_text,
                                                       "%B %d, %Y")
                elif left_text == "Description":
                    gallery.comment = right_div.get_text()
                elif left_text == "Tags":
                    for tag_a in right_div.find_all(
                            "a", href=lambda x: x and '/tags/' in x):
                        if tag_a.get_text().strip() == 'doujin':
                            is_doujinshi = True
                        gallery.tags.append(
                            translate_tag(tag_a.get_text().strip()))
            # The tag-based doujin check overrides the gid-based category guess.
            if is_doujinshi:
                gallery.category = 'Doujinshi'
            else:
                gallery.category = 'Manga'

            return gallery
        else:
            return None
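
The "Uploader" row above parses dates such as "January 2nd, 2016" by first stripping the ordinal suffix, since strptime has no directive for it. A small sketch of just that step, using an assumed sample string:

import re
from datetime import datetime

# Assumed sample of the site's "<uploader> on <date>" text.
right_date_text = 'January 2nd, 2016'

# Drop the ordinal suffix (st/nd/rd/th); '%B %d, %Y' can then parse it.
right_date_text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', right_date_text)
posted = datetime.strptime(right_date_text, '%B %d, %Y')
assert posted.day == 2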
Example No. 20
    def process_magazine_page(self, link: str,
                              response_text: str) -> Optional[GalleryData]:
        soup = BeautifulSoup(response_text, 'html.parser')
        magazine_container = soup.find("div", class_="wrap")

        comic_regex = re.compile("content-comic")

        if magazine_container:
            gid = link.replace(constants.main_url + '/', '')
            gallery = GalleryData(gid, self.name)
            if self.own_settings.get_posted_date_from_feed:
                gallery.posted = self.parse_posted_date_from_feed(
                    constants.aux_feed_url, gallery.gid)
            gallery.link = link
            gallery.tags = []
            gallery.magazine_chapters_gids = []
            gallery.title = magazine_container.find(
                "a", itemprop="item",
                href=re.compile("/" + gid)).span.get_text()
            gallery.category = 'Manga'  # We assume every magazine is commercial and keep the panda definition.

            thumbnail_container = magazine_container.find(
                "img", class_="content-poster")
            if thumbnail_container:
                gallery.thumbnail_url = thumbnail_container.get("src")
                if gallery.thumbnail_url and gallery.thumbnail_url.startswith(
                        '//'):
                    gallery.thumbnail_url = 'https:' + gallery.thumbnail_url

            description_container = magazine_container.find(
                "p", class_=re.compile("attribute-description"))

            if description_container:
                comment_text = description_container.decode_contents().replace(
                    "\n", "").replace("<br/>", "\n")
                comment_soup = BeautifulSoup(comment_text, 'html.parser')
                gallery.comment = comment_soup.get_text()

            chapters_container = magazine_container.find_all(
                "div", class_=comic_regex)

            # Sets dedupe tags and artists repeated across chapters.
            tags_set = set()
            artists_set = set()

            for chapter_container in chapters_container:
                chapter_title_container = chapter_container.find(
                    "a", class_="content-title")
                if chapter_title_container:
                    # chapter_title = chapter_title_container.get_text()
                    chapter_link = chapter_title_container.get('href').replace(
                        constants.main_url + '/', '')
                    chapter_gid = chapter_link.lstrip('/')
                    gallery.magazine_chapters_gids.append(chapter_gid)

                tags_container = chapter_container.find(
                    "div", {"class": "tags"})

                # Guard: some chapters may lack a tags block.
                if tags_container:
                    for tag_a in tags_container.find_all(
                            "a", href=lambda x: x and '/tags/' in x):
                        tags_set.add(translate_tag(tag_a.get_text().strip()))

                artist = chapter_container.find(
                    "a", href=lambda x: x and '/artists/' in x)

                if artist:
                    artists_set.add("artist:" +
                                    translate_tag(artist.get_text().strip()))

            gallery.tags = list(tags_set)
            gallery.tags.extend(list(artists_set))

            return gallery
        else:
            return None
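
Example No. 20 preserves paragraph breaks in the description by rewriting <br/> tags into newlines before extracting text, since get_text() alone would drop them. A minimal sketch of that step, on an assumed HTML snippet:

from bs4 import BeautifulSoup

# Assumed sample of a description block.
html = '<p class="attribute-description">Line one.<br/>Line two.</p>'

container = BeautifulSoup(html, 'html.parser').p
# Strip literal newlines, then turn <br/> markup into real newlines.
comment_text = container.decode_contents().replace("\n", "").replace("<br/>", "\n")
comment = BeautifulSoup(comment_text, 'html.parser').get_text()
assert comment == 'Line one.\nLine two.'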
Example No. 21
    def process_regular_gallery_page(
            self, link: str, response_text: str) -> Optional[GalleryData]:

        soup = BeautifulSoup(response_text, 'html.parser')
        gallery = GalleryData(link.replace(constants.main_url + '/', ''),
                              self.name)
        gallery.link = link
        gallery.tags = []
        title_container = soup.find("meta", property="og:title")
        gallery.title = title_container['content'] if title_container else ''

        # Defaulting to Doujinshi (might need to change later)
        gallery.category = 'Doujinshi'

        description_container = soup.find("script",
                                          id="ProductJson-product-template")

        # We can get most data from this object
        parsed_json = json.loads(description_container.string)

        # gallery.title = parsed_json.get('title', '')
        published_at = parsed_json.get('published_at')
        if published_at:
            gallery.posted = date_parser.parse(published_at)

        thumbnail_url = parsed_json.get('featured_image', None)
        if thumbnail_url:
            gallery.thumbnail_url = urljoin(thumbnail_url,
                                            urlparse(thumbnail_url).path)
            if gallery.thumbnail_url and gallery.thumbnail_url.startswith(
                    '//'):
                gallery.thumbnail_url = 'https:' + gallery.thumbnail_url

        if 'vendor' in parsed_json:
            gallery.tags.append(
                translate_tag("artist:" + parsed_json['vendor']))

        if 'tags' in parsed_json:
            for tag in parsed_json['tags']:
                gallery.tags.append(translate_tag(tag))

        if 'description' in parsed_json:
            description_soup = BeautifulSoup(parsed_json['description'],
                                             'html.parser')

            description_text = description_soup.get_text().replace(
                "Synopsis:", "")
            description_text_newline = description_soup.get_text("\n").replace(
                "Synopsis:", "")

            description_text_remove = description_text

            description_text_remove = self.PAGES_REGEX.sub(
                '', description_text_remove)
            description_text_remove = self.JP_TITLE_REGEX.sub(
                '', description_text_remove)
            description_text_remove = self.DATE_CONVENTION_REGEX.sub(
                '', description_text_remove)

            other_language_found = self.OTHER_LANGUAGE_REGEX.search(
                description_text_newline)
            if other_language_found:
                gallery.tags.append(
                    translate_tag("language:" + other_language_found.group(1)))
            else:
                gallery.tags.append(translate_tag("language:english"))

            pages_text_found = self.PAGES_REGEX.search(
                description_text_newline)
            if pages_text_found:
                gallery.filecount = int(pages_text_found.group(1))

            jp_text_found = self.JP_TITLE_REGEX.search(
                description_text_newline)
            if jp_text_found:
                gallery.title_jpn = jp_text_found.group(1)

            convention_text_found = self.DATE_CONVENTION_REGEX.search(
                description_text_newline)
            if convention_text_found:
                gallery.tags.append(
                    translate_tag("event:" + convention_text_found.group(1)))

            reparsed_text = ''

            # Roundabout way to drop everything after the first regex removal:
            # keep the common prefix of the stripped and original texts.
            for char_count, char in enumerate(description_text_remove):
                if char == description_text[char_count]:
                    reparsed_text += char
                else:
                    break

            gallery.comment = reparsed_text.strip()

        return gallery
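
The thumbnail handling in Example No. 21 rejoins the URL with only its path, which drops any query string or fragment from the image link. A standalone sketch with a hypothetical CDN URL:

from urllib.parse import urljoin, urlparse

# Hypothetical CDN URL carrying cache-busting parameters.
thumbnail_url = 'https://cdn.example.com/products/cover.jpg?v=12345'

# urljoin against the bare path keeps scheme and host but discards
# the query string and fragment.
clean_url = urljoin(thumbnail_url, urlparse(thumbnail_url).path)
assert clean_url == 'https://cdn.example.com/products/cover.jpg'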