def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Build a ``MetaRecord`` from a single search-result item.

    Args:
        result: One result mapping. It must contain ``"id"`` and a
            ``"volumeInfo"`` mapping with at least ``"title"``; every other
            ``volumeInfo`` key read here is optional.
        generic_cover: Forwarded to ``self._parse_cover``.
        locale: Forwarded to ``self._parse_languages``.

    Returns:
        Whatever ``self._parse_isbn`` returns for the populated record
        (presumably the same record with ISBN identifiers added -- confirm
        against that helper).

    Raises:
        KeyError: If ``"id"``, ``"volumeInfo"`` or the volume's ``"title"``
            is missing from ``result``.
    """
    book_id = result["id"]
    volume_info = result["volumeInfo"]

    source_info = MetaSourceInfo(
        id=self.__id__,
        description=Google.DESCRIPTION,
        link=Google.META_URL,
    )
    record = MetaRecord(
        id=book_id,
        title=volume_info["title"],
        authors=volume_info.get("authors", []),
        url=Google.BOOK_URL + book_id,
        source=source_info,
    )

    record.cover = self._parse_cover(result=result, generic_cover=generic_cover)
    record.description = volume_info.get("description", "")
    record.languages = self._parse_languages(result=result, locale=locale)
    record.publisher = volume_info.get("publisher", "")
    record.publishedDate = volume_info.get("publishedDate", "")
    record.rating = volume_info.get("averageRating", 0)

    # No series data is read from the result, so fixed placeholders are used.
    record.series = ""
    record.series_index = 1

    record.tags = volume_info.get("categories", [])
    record.identifiers = {"google": record.id}

    return self._parse_isbn(result=result, match=record)
def parse_single_book(
    self, match: MetaRecord, generic_cover: str, locale: str
) -> MetaRecord:
    """Download the page at ``match.url`` and fill *match* from its content.

    The parsed document is stored on ``self.root``; the ``_parse_*`` helpers
    called below take no document argument, so they presumably read it from
    there. Because of that shared attribute, concurrent calls on the same
    instance would overwrite each other's document.

    Args:
        match: Record to complete. Its ``url`` is fetched and its ``id`` is
            copied into the ``"lubimyczytac"`` identifier.
        generic_cover: Forwarded to ``self._parse_cover``.
        locale: Forwarded to ``self._parse_languages``.

    Returns:
        The same *match* object, updated in place.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including ``Timeout`` when the server does not respond in time.
    """
    # requests has no default timeout; without one an unresponsive server
    # would block this call (and its worker) indefinitely.
    # NOTE: the HTTP status is not checked, so an error page is parsed like
    # a regular book page.
    response = requests.get(match.url, timeout=10)
    self.root = fromstring(response.text)

    match.cover = self._parse_cover(generic_cover=generic_cover)
    match.description = self._parse_description()
    match.languages = self._parse_languages(locale=locale)
    match.publisher = self._parse_publisher()
    match.publishedDate = self._parse_from_summary(
        attribute_name="datePublished"
    )
    match.rating = self._parse_rating()
    match.series, match.series_index = self._parse_series()
    match.tags = self._parse_tags()
    match.identifiers = {
        "isbn": self._parse_isbn(),
        "lubimyczytac": match.id,
    }
    return match
def _parse_single_book(self, id: str, generic_cover: str = "") -> Optional[MetaRecord]: url = f"https://book.douban.com/subject/{id}/" try: r = self.session.get(url) r.raise_for_status() except Exception as e: log.warning(e) return None match = MetaRecord( id=id, title="", authors=[], url=url, source=MetaSourceInfo( id=self.__id__, description=self.DESCRIPTION, link=self.META_URL, ), ) html = etree.HTML(r.content.decode("utf8")) match.title = html.xpath(self.TITTLE_XPATH)[0].text match.cover = html.xpath( self.COVER_XPATH)[0].attrib["href"] or generic_cover try: rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip()) except Exception: rating_num = 0 match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0 tag_elements = html.xpath(self.TAGS_XPATH) if len(tag_elements): match.tags = [tag_element.text for tag_element in tag_elements] description_element = html.xpath(self.DESCRIPTION_XPATH) if len(description_element): match.description = html2text( etree.tostring(description_element[-1], encoding="utf8").decode("utf8")) info = html.xpath(self.INFO_XPATH) for element in info: text = element.text if self.AUTHORS_PATTERN.search(text): next = element.getnext() while next is not None and next.tag != "br": match.authors.append(next.text) next = next.getnext() elif self.PUBLISHER_PATTERN.search(text): match.publisher = element.tail.strip() elif self.SUBTITLE_PATTERN.search(text): match.title = f'{match.title}:' + element.tail.strip() elif self.PUBLISHED_DATE_PATTERN.search(text): match.publishedDate = self._clean_date(element.tail.strip()) elif self.SUBTITLE_PATTERN.search(text): match.series = element.getnext().text elif i_type := self.IDENTIFIERS_PATTERN.search(text): match.identifiers[i_type.group()] = element.tail.strip()