Example #1: ComicVine provider, _parse_search_result
 def _parse_search_result(
     self, result: Dict, generic_cover: str, locale: str
 ) -> MetaRecord:
     series = result["volume"].get("name", "")
     series_index = result.get("issue_number", 0)
     issue_name = result.get("name", "")
     match = MetaRecord(
         id=result["id"],
         title=f"{series}#{series_index} - {issue_name}",
         authors=result.get("authors", []),
         url=result.get("site_detail_url", ""),
         source=MetaSourceInfo(
             id=self.__id__,
             description=ComicVine.DESCRIPTION,
             link=ComicVine.META_URL,
         ),
         series=series,
     )
     match.cover = result["image"].get("original_url", generic_cover)
     match.description = result.get("description", "")
     match.publishedDate = result.get("store_date", result.get("date_added"))
     match.series_index = series_index
     match.tags = ["Comics", series]
     match.identifiers = {"comicvine": match.id}
     return match
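
These snippets all populate the same two containers, MetaRecord and MetaSourceInfo, which come from calibre-web's metadata-provider interface. As a reading aid, here is a minimal sketch of those containers with the fields inferred from how the examples below use them; the types and defaults are assumptions rather than the upstream definitions.

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union


@dataclass
class MetaSourceInfo:
    id: str
    description: str
    link: str


@dataclass
class MetaRecord:
    id: Union[str, int]
    title: str
    authors: List[str]
    url: str
    source: MetaSourceInfo
    cover: str = ""
    description: Optional[str] = ""
    series: Optional[str] = None
    series_index: Optional[Union[int, float]] = 0
    identifiers: Dict[str, Union[str, int]] = field(default_factory=dict)
    publisher: Optional[str] = None
    publishedDate: Optional[str] = None
    rating: Optional[int] = 0
    languages: Optional[List[str]] = field(default_factory=list)
    tags: Optional[List[str]] = field(default_factory=list)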
Example #2: Google provider, _parse_isbn
 def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
     identifiers = result["volumeInfo"].get("industryIdentifiers", [])
     for identifier in identifiers:
         if identifier.get("type") == Google.ISBN_TYPE:
             match.identifiers["isbn"] = identifier.get("identifier")
             break
     return match
Example #3: LubimyCzytac provider, parse_single_book
 def parse_single_book(self, match: MetaRecord, generic_cover: str,
                       locale: str) -> MetaRecord:
     response = requests.get(match.url)
     self.root = fromstring(response.text)
     match.cover = self._parse_cover(generic_cover=generic_cover)
     match.description = self._parse_description()
     match.languages = self._parse_languages(locale=locale)
     match.publisher = self._parse_publisher()
     match.publishedDate = self._parse_from_summary(
         attribute_name="datePublished")
     match.rating = self._parse_rating()
     match.series, match.series_index = self._parse_series()
     match.tags = self._parse_tags()
     match.identifiers = {
         "isbn": self._parse_isbn(),
         "lubimyczytac": match.id,
     }
     return match
Example #4: scholar provider, _parse_search_result
    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
        match = MetaRecord(
            id=result.get("pub_url", result.get("eprint_url", "")),
            title=result["bib"].get("title"),
            authors=result["bib"].get("author", []),
            url=result.get("pub_url", result.get("eprint_url", "")),
            source=MetaSourceInfo(
                id=self.__id__, description=self.__name__, link=scholar.META_URL
            ),
        )

        match.cover = result.get("image", {}).get("original_url", generic_cover)
        match.description = unquote(result["bib"].get("abstract", ""))
        match.publisher = result["bib"].get("venue", "")
        pub_year = result["bib"].get("pub_year")
        match.publishedDate = f"{pub_year}-01-01" if pub_year else ""
        match.identifiers = {"scholar": match.id}
        return match
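
Example #4 only touches a handful of keys on the result it receives; a placeholder sketch of that shape, limited to the fields read above (all values are illustrative):

# Placeholder result, shaped the way the snippet above accesses it.
sample_result = {
    "pub_url": "https://example.org/paper",
    "eprint_url": "https://example.org/paper.pdf",
    "bib": {
        "title": "Example Paper Title",
        "author": ["Example Author"],
        "abstract": "Example abstract.",
        "venue": "Example Venue",
        "pub_year": "2020",
    },
}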
Example #5: LubimyCzytac provider, parse_search_results
    def parse_search_results(self) -> List[MetaRecord]:
        matches = []
        results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
        for result in results:
            title = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.TITLE_TEXT_PATH}",
            )

            book_url = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.URL_PATH}",
            )
            authors = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.AUTHORS_PATH}",
                take_first=False,
            )
            if not all([title, book_url, authors]):
                continue
            matches.append(
                MetaRecord(
                    id=book_url.replace(f"/ksiazka/", "").split("/")[0],
                    title=title,
                    authors=[strip_accents(author) for author in authors],
                    url=LubimyCzytac.BASE_URL + book_url,
                    source=MetaSourceInfo(
                        id=self.metadata.__id__,
                        description=self.metadata.__name__,
                        link=LubimyCzytac.BASE_URL,
                    ),
                )
            )
        return matches
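
The strip_accents helper used above is defined elsewhere in the module and is not shown here; a minimal stand-in, assuming it only removes diacritics (the real helper may special-case letters such as the Polish 'ł', which Unicode decomposition leaves untouched):

import unicodedata


def strip_accents(s: str) -> str:
    # Decompose each character, drop the combining marks, keep the base letters.
    return "".join(
        ch for ch in unicodedata.normalize("NFKD", s)
        if not unicodedata.combining(ch)
    )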
Example #6: Google provider, _parse_search_result
    def _parse_search_result(self, result: Dict, generic_cover: str,
                             locale: str) -> MetaRecord:
        match = MetaRecord(
            id=result["id"],
            title=result["volumeInfo"]["title"],
            authors=result["volumeInfo"].get("authors", []),
            url=Google.BOOK_URL + result["id"],
            source=MetaSourceInfo(
                id=self.__id__,
                description=Google.DESCRIPTION,
                link=Google.META_URL,
            ),
        )

        match.cover = self._parse_cover(result=result,
                                        generic_cover=generic_cover)
        match.description = result["volumeInfo"].get("description", "")
        match.languages = self._parse_languages(result=result, locale=locale)
        match.publisher = result["volumeInfo"].get("publisher", "")
        match.publishedDate = result["volumeInfo"].get("publishedDate", "")
        match.rating = result["volumeInfo"].get("averageRating", 0)
        match.series, match.series_index = "", 1
        match.tags = result["volumeInfo"].get("categories", [])

        match.identifiers = {"google": match.id}
        match = self._parse_isbn(result=result, match=match)
        return match
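
Examples #2 and #6 read the result dictionary in the shape returned by the Google Books volumes API. The placeholder below shows only the keys those snippets access, and assumes Google.ISBN_TYPE names one of the industryIdentifiers types (for example "ISBN_13"):

# Placeholder volume, shaped like one entry of the Google Books "items" array.
sample_result = {
    "id": "VOLUME_ID",
    "volumeInfo": {
        "title": "Example Title",
        "authors": ["Example Author"],
        "publisher": "Example Publisher",
        "publishedDate": "2020-01-01",
        "description": "Example description.",
        "averageRating": 4,
        "categories": ["Fiction"],
        "industryIdentifiers": [
            {"type": "ISBN_13", "identifier": "9780000000000"},
            {"type": "ISBN_10", "identifier": "0000000000"},
        ],
    },
}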
Example #7: Amazon provider, inner helper
        def inner(link, index) -> Optional[Tuple[MetaRecord, int]]:
            with self.session as session:
                try:
                    r = session.get(f"https://www.amazon.com/{link}")
                    r.raise_for_status()
                except Exception as ex:
                    log.warning(ex)
                    return
                long_soup = BS(r.text, "lxml")  #~4sec :/
                soup2 = long_soup.find(
                    "div",
                    attrs={
                        "cel_widget_id":
                        "dpx-books-ppd_csm_instrumentation_wrapper"
                    })
                if soup2 is None:
                    return
                try:
                    match = MetaRecord(
                        title="",
                        authors="",
                        source=MetaSourceInfo(id=self.__id__,
                                              description="Amazon Books",
                                              link="https://amazon.com/"),
                        url=f"https://www.amazon.com{link}",
                        # Extra per-field lookups slow the search down; these
                        # fields are unreliable or absent on Amazon, so they stay empty.
                        publisher="",  # very unreliable
                        publishedDate="",  # very unreliable
                        id=None,  # ?
                        tags=[]  # don't exist on Amazon
                    )

                    try:
                        match.description = "\n".join(
                            soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\
                                                .replace("\xa0"," ")[:-9].strip().strip("\n")
                    except (AttributeError, TypeError):
                        return None  # if there is no description it is not a book and therefore should be ignored
                    try:
                        match.title = soup2.find("span",
                                                 attrs={
                                                     "id": "productTitle"
                                                 }).text
                    except (AttributeError, TypeError):
                        match.title = ""
                    try:
                        match.authors = [
                            next(
                                filter(
                                    lambda i: i != " " and i != "\n"
                                    and not i.startswith("{"),
                                    x.findAll(text=True))).strip()
                            for x in soup2.findAll("span",
                                                   attrs={"class": "author"})
                        ]
                    except (AttributeError, TypeError, StopIteration):
                        match.authors = ""
                    try:
                        match.rating = int(
                            soup2.find("span", class_="a-icon-alt")
                            .text.split(" ")[0]
                            .split(".")[0])  # first number in the rating string
                    except (AttributeError, ValueError):
                        match.rating = 0
                    try:
                        match.cover = soup2.find(
                            "img",
                            attrs={"class":
                                   "a-dynamic-image frontImage"})["src"]
                    except (AttributeError, TypeError):
                        match.cover = ""
                    return match, index
                except Exception as e:
                    log.error_or_exception(e)
                    return
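
The (match, index) pair returned by inner suggests that the caller fans these page fetches out concurrently and then restores the original ordering. A hypothetical driver along those lines; fetch_all, the links argument, and the worker count are illustrative and not part of the snippet:

from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_all(links):
    # Fan the per-link scrapes out to a small thread pool; inner() returns
    # None on any failure, so only successful (match, index) pairs are kept.
    results = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(inner, link, i) for i, link in enumerate(links)]
        for future in as_completed(futures):
            pair = future.result()
            if pair is not None:
                results.append(pair)
    # Re-establish the original search order using the echoed index.
    return [match for match, _ in sorted(results, key=lambda pair: pair[1])]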
Example #8: Douban provider, _parse_single_book
    def _parse_single_book(self,
                           id: str,
                           generic_cover: str = "") -> Optional[MetaRecord]:
        url = f"https://book.douban.com/subject/{id}/"

        try:
            r = self.session.get(url)
            r.raise_for_status()
        except Exception as e:
            log.warning(e)
            return None

        match = MetaRecord(
            id=id,
            title="",
            authors=[],
            url=url,
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )

        html = etree.HTML(r.content.decode("utf8"))

        match.title = html.xpath(self.TITTLE_XPATH)[0].text
        match.cover = html.xpath(
            self.COVER_XPATH)[0].attrib["href"] or generic_cover
        try:
            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
        except Exception:
            rating_num = 0
        # Douban rates on a 10-point scale; ceiling-divide to map it onto 5 stars.
        match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0

        tag_elements = html.xpath(self.TAGS_XPATH)
        if len(tag_elements):
            match.tags = [tag_element.text for tag_element in tag_elements]

        description_element = html.xpath(self.DESCRIPTION_XPATH)
        if len(description_element):
            match.description = html2text(
                etree.tostring(description_element[-1],
                               encoding="utf8").decode("utf8"))

        info = html.xpath(self.INFO_XPATH)

        for element in info:
            text = element.text
            if self.AUTHORS_PATTERN.search(text):
                nxt = element.getnext()
                while nxt is not None and nxt.tag != "br":
                    match.authors.append(nxt.text)
                    nxt = nxt.getnext()
            elif self.PUBLISHER_PATTERN.search(text):
                match.publisher = element.tail.strip()
            elif self.SUBTITLE_PATTERN.search(text):
                match.title = f"{match.title}:{element.tail.strip()}"
            elif self.PUBLISHED_DATE_PATTERN.search(text):
                match.publishedDate = self._clean_date(element.tail.strip())
            elif self.SERIES_PATTERN.search(text):
                match.series = element.getnext().text
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
                match.identifiers[i_type.group()] = element.tail.strip()

        return match
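
The final loop leans on lxml's element model: the field label lives in an element's .text, the value printed after it on the same line lives in .tail, and multi-valued fields such as the author list are collected by walking .getnext() until a <br> separator. A self-contained illustration of that pattern on made-up markup (not Douban's actual HTML):

from lxml import etree

fragment = etree.HTML(
    "<div id='info'>"
    "<span>Author:</span> <a>Author One</a> <a>Author Two</a><br/>"
    "<span>Publisher:</span> Example Press<br/>"
    "</div>"
)
labels = fragment.xpath("//div[@id='info']/span")

# Single-valued field: the value is the label's tail text.
print(labels[1].tail.strip())   # -> Example Press

# Multi-valued field: walk the following siblings until the <br/> separator.
node = labels[0].getnext()
authors = []
while node is not None and node.tag != "br":
    authors.append(node.text)
    node = node.getnext()
print(authors)                  # -> ['Author One', 'Author Two']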