Example #1
    def get_manga_info_from_url(self, url: str) -> Optional[Manga]:
        """
        Gets the list of all volumes from a manga presentation page url

        Args:
            url (string): url of the manga

        Returns:
            manga filled with retrieved volumes and chapters
            None (None): None if there is an error
        """

        def is_float(string):
            try:
                float(string)
                return True
            except ValueError:
                return False

        def is_int(number):
            # int() never raises ValueError on a float and silently truncates,
            # so test for a whole value instead
            return float(number).is_integer()

        soup = self.get_soup(url)
        if soup is None:
            return None

        retrieved_chapters_list: List[Chapter] = []
        try:
            name = soup.find("h2", {"class": "widget-title"}).text.strip()
            synopsis = soup.find("div", {"class": "well"}).find("p").text

            raw_chapters_list = soup.find_all("h5", {"class": "chapter-title-rtl"})
            for raw_chapter in raw_chapters_list:
                retrieved_chapter = Chapter(name=raw_chapter.find("em").text,
                                            link=raw_chapter.find("a")["href"])
                number_raw_chapter = raw_chapter.find("a").text
                list_number = [float(s) for s in number_raw_chapter.split() if is_float(s)]
                list_number = [int(s) if is_int(s) else s for s in list_number]
                retrieved_chapter.number = list_number[-1]

                retrieved_chapters_list.append(retrieved_chapter)

        except Exception as e:
            self.print_v("Impossible to get the correct tags from the soup from the page ", url, ": ", str(e))
            return None

        retrieved_manga = Manga(name=name, link=url, synopsis=synopsis)
        retrieved_manga.add_chapters_without_volume(retrieved_chapters_list)

        return retrieved_manga
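The number-extraction step above is easy to try in isolation. A minimal, runnable sketch of the same token filtering; the title string is made up:

title = "Chapter 22.5 : The Town Where Everything Began"  # hypothetical title

numbers = []
for token in title.split():
    try:
        value = float(token)  # keep only the numeric tokens
    except ValueError:
        continue
    # keep decimal chapter numbers as floats, render whole ones as ints
    numbers.append(int(value) if value.is_integer() else value)

print(numbers[-1])  # -> 22.5, the last number found in the title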
Example #2
    def get_info_from_chapter_url(self, url: str) -> Optional[Chapter]:
        """Takes the url of a chapter and returns a set of valuable infos.
            Args:
                url (str): url of the chapter
            Returns:
                Optional[Chapter]: the chapter filled with its pages,
                or None if there is an error
            Raises:
                Doesn't raise; prints a warning with self.print_v().

            Examples:
                >>> e = EngineLelscan()
                >>> e.get_info_from_chapter_url("https://www.lelscan-vf.com/manga/shingeki-no-kyojin/22")
                {'manga_title': 'Shingeki No Kyojin', 'chapter_num': '22', 'max_pages': 44,
                 'pages': [{'link': ' https://www.lelscan-vf.com/uploads/manga/shingeki-no-kyojin/chapters/0022/001.jpg ', 'num': 1},
                           {'link': ' https://www.lelscan-vf.com/uploads/manga/shingeki-no-kyojin/chapters/0022/002.jpg ', 'num': 2},
                           ...
                           {'link': ' https://www.lelscan-vf.com/uploads/manga/shingeki-no-kyojin/chapters/0022/044.jpg ', 'num': 44}]}
        """

        soup = self.get_soup(url)
        if soup is None:
            return None

        # if the chapter is missing, quit
        if not self.verify_missing_chapter(soup):
            return None

        try: # Some blank pages can still pass
            manga_title = soup.find("img", {"class": "scan-page"})["alt"].split(":")[0].strip()
            list_number_page = [int(opt["value"]) for opt in soup.find_all("option") if "value" in opt.attrs]
            max_page = max(list_number_page)
        except Exception as e:
            self.print_v("Impossible to get 'img' and 'alt' fields in the soup from this url ", url, ": ", str(e))
            return None

        # Get the pictures links
        try:
            images_link = [img["data-src"] for img in soup.find_all("img", {"class": "img-responsive"}) if "data-src" in img.attrs]
        except Exception as e:
            self.print_v("Impossible to extract images link from the soup at the url ", url, ": ", str(e))
            return None

        if len(images_link) != len(list_number_page):
            self.print_v("Error, the number of page numbers doesn't match the number of image links, ", url)
            return None

        pages_list = []
        for number, link in zip(list_number_page, images_link):
            pages_list.append(Page(number=number, link=link))

        chapter_num = url.rsplit("/", 1)[-1]

        retrieved_chapter = Chapter()

        retrieved_chapter.manga_name = manga_title
        retrieved_chapter.number = chapter_num
        retrieved_chapter.number_page = max_page
        retrieved_chapter.add_pages(pages_list)

        return retrieved_chapter
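The max_page computation above reads the page numbers out of the reader's <select> dropdown. A minimal sketch of that extraction, assuming bs4 and a made-up HTML fragment:

from bs4 import BeautifulSoup

# hypothetical fragment standing in for the reader's page selector
html = """<select>
  <option value="1">1</option>
  <option value="2">2</option>
  <option value="3">3</option>
</select>"""

soup = BeautifulSoup(html, "html.parser")
list_number_page = [int(opt["value"]) for opt in soup.find_all("option") if "value" in opt.attrs]
print(max(list_number_page))  # -> 3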
Example #3
    def get_info_from_chapter_url(self, url: str) -> Optional[Chapter]:
        """Takes the url of a chapter, and returns a set of valuable infos
            Args:
                url (string): url of the chapter
            Returns:
                chapter (Chapter): chapters
                None (None): None if there is an error
            Raises:
                Doesn't raise an error. print a warning with self.print_v().
         """

        soup = self.get_soup(url)
        if soup is None:
            return None

        try:  # Some blank pages can still pass
            manga_title = soup.find("img", {"class": "scan-page"})["alt"].split(":")[0].strip()
            list_number_page = [int(opt["value"]) for opt in soup.find_all("option") if "value" in opt.attrs]
            max_page = max(list_number_page)
        except Exception as e:
            self.print_v("Impossible to get 'img' and 'alt' fields in the soup from this url ", url, ": ", str(e))
            return None

        # Get the pictures links
        try:
            images_link = [img["data-src"] for img in soup.find_all("img", {"class": "img-responsive"}) if "data-src" in img.attrs]
        except Exception as e:
            self.print_v("Impossible to extract images link from the soup at the url ", url, ": ", str(e))
            return None

        if len(images_link) != len(list_number_page):
            self.print_v("Error, the number of page numbers doesn't match the number of image links, ", url)
            return None

        pages_list: List[Page] = []
        for number, link in zip(list_number_page, images_link):
            pages_list.append(Page(number=number, link=link))

        chapter_num = url.rsplit("/", 1)[-1]

        retrieved_chapter = Chapter()

        retrieved_chapter.manga_name = manga_title
        retrieved_chapter.number = chapter_num
        retrieved_chapter.number_page = max_page
        retrieved_chapter.add_pages(pages_list)

        return retrieved_chapter
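The pairing of page numbers with image links can also be sketched standalone; Page here is a stand-in namedtuple, not the repo's actual class, and the links are invented:

from collections import namedtuple

Page = namedtuple("Page", ["number", "link"])  # stand-in for the repo's Page class

list_number_page = [1, 2, 3]
images_link = ["https://example.invalid/chapters/0022/%03d.jpg" % n for n in (1, 2, 3)]

pages_list = [Page(number=n, link=l) for n, l in zip(list_number_page, images_link)]
print(pages_list[0].link)  # -> https://example.invalid/chapters/0022/001.jpg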
Example #4
    def get_manga_info_from_url(self, url: str) -> Optional[Manga]:
        """
        Gets the list of all volumes from a manga presentation page url.

        Args:
            url (str): url of the manga

        Returns:
            Optional[Manga]: the manga filled with the retrieved volumes and
            chapters, or None if there is an error
        """

        soup = self.get_soup(url)
        if soup is None:
            return None

        retrieved_chapters_list: List[Chapter] = []
        retrieved_volumes_list: List[Volume] = []
        retrieved_volumes_dict = {}
        try:
            pattern = re.compile(r'Description')
            manga_name = soup.find("span", {"class": "mx-1"}).text
            synopsis = (soup.find("div", {"class": "col-lg-3 col-xl-2 strong"}, text=pattern)) \
                .parent.find("div", {"class": "col-lg-9 col-xl-10"}).text
            # number of chapter-list pages, read from the pagination widget
            max_list_page = int(
                soup.find("li", {
                    "class": "page-item paging"
                }).find("a")["href"].split("/")[-2])

            for nb_page in range(1, max_list_page + 1):
                page_url = url + "/" + "chapters/" + str(nb_page)
                soup = self.get_soup(page_url)

                chapter_field_in_page = soup.find_all(
                    "a", {"class": "text-truncate"})
                for chapter_field in chapter_field_in_page:

                    text_field = chapter_field.text
                    elements = text_field.split("-")

                    retrieved_chapter = Chapter(name=elements[-1].strip(),
                                                link=self.url_manga +
                                                chapter_field["href"])

                    raw_number = elements[0]

                    if "Vol." in raw_number:
                        string = raw_number.split("Vol.")[1].split("Ch. ")
                        volume_number = int(string[0].strip())
                        chapter_number = int(string[1].strip())
                    elif "Ch. " in raw_number:
                        volume_number = 0
                        chapter_number = int(
                            raw_number.split("Ch. ")[1].strip())
                    else:
                        self.print_v(raw_number, " couldn't be parsed")
                        continue
                    retrieved_chapter.number = chapter_number

                    if volume_number not in retrieved_volumes_dict:
                        retrieved_volumes_dict[volume_number] = []
                    retrieved_volumes_dict[volume_number].append(
                        retrieved_chapter)

        except Exception as e:
            self.print_v(
                "Impossible to get the correct tags from the soup from the page ",
                url, ": ", str(e))
            return None

        # Now we put all the chapters in their volume
        for volume_number, chapters in retrieved_volumes_dict.items():
            if volume_number == 0:
                retrieved_chapters_list.extend(chapters)
            else:
                retrieved_volume = Volume(number=volume_number)
                for chapter in chapters:
                    retrieved_volume.add_chapter(chapter)
                retrieved_volumes_list.append(retrieved_volume)

        retrieved_manga = Manga(name=manga_name, link=url, synopsis=synopsis)
        retrieved_manga.add_chapters_without_volume(retrieved_chapters_list)
        for retrieved_volume in retrieved_volumes_list:
            retrieved_manga.add_volume(retrieved_volume)

        return retrieved_manga
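The "Vol. X Ch. Y" branching above is worth checking on its own. A minimal sketch of the same parsing, with made-up link texts:

def parse_volume_and_chapter(raw_number):
    # mirrors the branching above: "Vol." entries carry both numbers,
    # bare "Ch. " entries go to the volume-less bucket (volume 0)
    if "Vol." in raw_number:
        string = raw_number.split("Vol.")[1].split("Ch. ")
        return int(string[0].strip()), int(string[1].strip())
    if "Ch. " in raw_number:
        return 0, int(raw_number.split("Ch. ")[1].strip())
    raise ValueError(raw_number + " couldn't be parsed")

print(parse_volume_and_chapter("Vol. 3 Ch. 21 "))  # -> (3, 21)
print(parse_volume_and_chapter("Ch. 105 "))        # -> (0, 105)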
Example #5
    def get_manga_info_from_url(self, url: str) -> Optional[Manga]:
        """
        a manga object

        Args:
            url (string): url of the manga

        Returns:
            manga filled with retrieved volumes and chapters
            None (None): None if there is an error
        """
        def is_float(string):
            try:
                float(string)
                return True
            except ValueError:
                return False

        def is_int(number):
            # int() never raises ValueError on a float and silently truncates,
            # so test for a whole value instead
            return float(number).is_integer()

        soup = self.get_soup(url)
        if soup is None:
            return None

        retrieved_chapters_list: List[Chapter] = []
        retrieved_volumes_list: List[Volume] = []
        retrieved_volumes_dict = {}
        try:
            name = soup.find("h2", {"class": "widget-title"}).text.strip()
            synopsis = soup.find("div", {"class": "well"}).find("p").text

            # find chapters via the volume-numberOfTheVolume class on each <li>
            volume_re = re.compile(r"volume-")
            raw_chapters_list = soup.find_all("li", {"class": volume_re})

            # we group chapters with the same volume under the same dict key
            for raw_chapter in raw_chapters_list:
                raw_volume_number = raw_chapter["class"][0]
                retrieved_volumes_dict.setdefault(raw_volume_number, []).append(raw_chapter)

            # now, we can go through the dictionary and save chapters in their own volume
            for raw_volume_label in retrieved_volumes_dict:
                # for all chapters in the same volume
                volume_number = int(raw_volume_label.split("-")[-1])
                retrieved_volume = Volume(number=volume_number)

                for raw_chapter in retrieved_volumes_dict[raw_volume_label]:
                    # extract the title block holding the chapter name and link
                    title_tag = raw_chapter.find("h5", {"class": "chapter-title-rtl"})
                    retrieved_chapter = Chapter(name=title_tag.find("em").text,
                                                link=title_tag.find("a")["href"])
                    number_raw_chapter = title_tag.find("a").text
                    list_number = [float(s) for s in number_raw_chapter.split() if is_float(s)]
                    list_number = [int(s) if is_int(s) else s for s in list_number]
                    retrieved_chapter.number = list_number[-1]

                    if volume_number == 0:  # chapter without a volume
                        retrieved_chapters_list.append(retrieved_chapter)
                    else:
                        retrieved_volume.add_chapter(retrieved_chapter)

                if volume_number != 0:
                    retrieved_volumes_list.append(retrieved_volume)

        except Exception as e:
            self.print_v("Impossible to get the correct tags from the soup from the page ", url, ": ", str(e))
            return None

        retrieved_manga = Manga(name=name, link=url, synopsis=synopsis)
        retrieved_manga.add_chapters_without_volume(retrieved_chapters_list)

        for retrieved_volume in retrieved_volumes_list:
            retrieved_manga.add_volume(retrieved_volume)

        return retrieved_manga
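The volume-N class matching can be exercised against a small fragment. A sketch assuming bs4; the markup is invented to mimic the site's structure:

import re
from bs4 import BeautifulSoup

# made-up markup mimicking the volume-N classes the engine matches on
html = """<ul>
  <li class="volume-1"><h5 class="chapter-title-rtl"><a href="/c/1">Ch 1</a> <em>Start</em></h5></li>
  <li class="volume-1"><h5 class="chapter-title-rtl"><a href="/c/2">Ch 2</a> <em>Next</em></h5></li>
  <li class="volume-0"><h5 class="chapter-title-rtl"><a href="/c/5.5">Ch 5.5</a> <em>Extra</em></h5></li>
</ul>"""

soup = BeautifulSoup(html, "html.parser")
by_volume = {}
for li in soup.find_all("li", {"class": re.compile(r"volume-")}):
    by_volume.setdefault(li["class"][0], []).append(li)

print({k: len(v) for k, v in by_volume.items()})  # -> {'volume-1': 2, 'volume-0': 1}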
Example #6
    def get_manga_info_from_url(self, url: str) -> Optional[Manga]:
        """
        Gets the list of all volumes from a manga url

        Args:
            url (string): url of the manga

        Returns:
            manga filled with retrieved volumes and chapters
            None (None): None if there is an error
        """

        # We get the soup related to the url
        soup = self.get_soup(url)
        if soup is None:
            return None

        # We look for a specific tag in the page
        try:
            raw_page_list = soup.find_all("option", {"rel": "bookmark"})
        except Exception as e:
            self.print_v(
                "impossible to fin the 'option' tag in the webpage from ", url,
                ": ", str(e))
            return None

        # we extract titles, number of the volume....
        retrieved_chapters_list: List[Chapter] = []
        number_re = re.compile(r"([\d]+[\.,]?[\d]?)")
        try:
            for page in raw_page_list:

                try:
                    retrieved_chapter = Chapter(name=page.text.strip(),
                                                link=page["value"])
                    title = page.text.strip()
                    # normalize decimal commas to dots
                    raw_title = title.replace(",", ".")

                    # We need to handle decimal valued chapters
                    regex_search = number_re.search(raw_title)

                    chapter_number_raw = regex_search.group()

                    # int("22.5") raises ValueError, so go through float first
                    chapter_number = float(chapter_number_raw)
                    if chapter_number.is_integer():
                        chapter_number = int(chapter_number)
                    retrieved_chapter.number = chapter_number

                    retrieved_chapters_list.append(retrieved_chapter)
                except Exception as e:
                    self.print_v("error with " + str(page) +
                                 " due to error :" + str(e))
                    continue

        except Exception as e:
            self.print_v(
                "Impossible to get proper numbers from the html page ", url,
                ": ", str(e))
            return None

        retrieved_chapters_list = sorted(retrieved_chapters_list,
                                         key=lambda chapter: float(chapter.number))

        # We need the correct title as mentioned in the page
        try:
            name = soup.find("h1").text.strip()

        except Exception as e:
            self.print_v(
                "impossible to find the 'h1' tag for title in th page ", url,
                ": ", str(e))
            return None

        retrieved_manga = Manga(name=name, link=url,
                                synopsis="")  # TODO: get synopsis
        retrieved_manga.add_chapters_without_volume(retrieved_chapters_list)

        return retrieved_manga
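The decimal-aware number parsing above distills to a few lines. A sketch using the same regex; the titles are made up:

import re

number_re = re.compile(r"([\d]+[\.,]?[\d]?)")

def chapter_number(title):
    raw_title = title.replace(",", ".")  # normalize decimal commas
    raw = number_re.search(raw_title).group()
    value = float(raw)
    return int(value) if value.is_integer() else value

print(chapter_number("Chapter 110,5 - extra"))  # -> 110.5
print(chapter_number("Chapter 111"))            # -> 111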
Example #7
    def get_info_from_chapter_url(self, url: str) -> Optional[Chapter]:
        """Takes the url of a chapter, and returns a set of valuable infos
            Args:
                url (string): url of the chapter
            Returns:
                chapter (Chapter): chapters
                None (None): None if there is an error
            Raises:
                Doesn't raise an error. print a warning with self.print_v().
         """

        soup = self.get_soup(url)
        if soup is None:
            return None

        # We extract all img fields.
        try:
            found_pages_list = soup.find_all("img",
                                             {"class": "lozad lazyload"})
        except Exception as e:
            self.print_v("Impossible to get the 'img' tag in the page ", url,
                         ": ", str(e))
            return None

        # Retry with a little "?view" hack to get the page while avoiding the ads
        if not found_pages_list:
            self.print_v("error, nothing found in the page",
                         url + " . I will try another time")
            url += "?view"
            soup = self.get_soup(url)
            if soup is None:
                return None
            try:
                found_pages_list = soup.find_all("img",
                                                 {"class": "lozad lazyload"})
            except Exception as e:
                self.print_v("Impossible to get the 'img' tag in the page ",
                             url, ": ", str(e))
                return None

            if not found_pages_list:
                self.print_v(
                    "error, nothing found in the page even adding ?view")
                return None

        # We create the list of pages that are linked to this chapter
        pages_list: List[Page] = []
        try:
            for page in found_pages_list:
                name = page["alt"]
                index = name.find("Page")
                num = int(name[index + len("Page"):])
                if page["data-src"][:8] == "https://":
                    link = page["data-src"]
                else:
                    link = self.url_root + page["data-src"]

                retrieved_page = Page(number=num, link=link)

                pages_list.append(retrieved_page)

        except Exception as e:
            self.print_v(
                "Impossible to get the 'alt' or 'Page' or 'link' tag in the page ",
                url, ": ", str(e))
            return None

        # We get general manga info: name, chapter number, manga title, max page count
        first_page = found_pages_list[0]
        try:
            name = first_page["alt"]
            index = name.find("Chapter")
            chapter_num = int(name[index + len("Chapter"):].strip().split()[0])
            manga_title = name.split(":")[0].strip()
            max_page = int(pages_list[-1].number)

        except Exception as e:
            self.print_v("Impossible to get tags 'alt' or 'Chapter'", ": ",
                         str(e))
            return None

        retrieved_chapter = Chapter()
        retrieved_chapter.manga_name = manga_title
        retrieved_chapter.number = chapter_num
        retrieved_chapter.number_page = max_page
        retrieved_chapter.add_pages(pages_list)

        return retrieved_chapter
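The alt-text parsing above assumes a format like "<title>: Chapter <n> ... Page <m>", inferred from the code rather than verified against the site. A minimal sketch with an invented alt string:

# the alt format is an assumption inferred from the parsing code above
alt = "One Piece: Chapter 1044 Page 7"

manga_title = alt.split(":")[0].strip()
chapter_num = int(alt[alt.find("Chapter") + len("Chapter"):].strip().split()[0])
page_num = int(alt[alt.find("Page") + len("Page"):])

print(manga_title, chapter_num, page_num)  # -> One Piece 1044 7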