def get_book_info(soup):
    """Build a BookInfo from a parsed book page.

    Labelled fields are looked up through ``Retriever.get_field`` inside the
    ``div`` with class ``book_body``; the cover image is searched for in the
    whole document (``img`` with class ``thumb``).

    Returns the populated BookInfo.  Fields whose label is not found keep
    whatever default BookInfo gives them, except ``authors`` (set to an
    empty list) and ``image`` (set to None).
    """
    body = soup.find('div', {'class' : 'book_body'})
    info = BookInfo()

    # title: the field text still carries its label, so strip it with a regex
    title_field = Retriever.get_field(body, u'Название:')
    if title_field:
        title_match = re.match(u"Название:(.+)", title_field.text)
        if title_match and title_match.groups():
            info.title = title_match.group(1)

    # authors: one entry per anchor inside the author field
    author_field = Retriever.get_field(body, u'Автор:')
    author_names = []
    if author_field:
        for anchor in author_field.findAll('a'):
            author_names.append(anchor.text)
    info.authors = author_names

    # summary: text of the first paragraph of the description field
    summary_field = Retriever.get_field(body, u'Описание книги:')
    if summary_field and summary_field.p:
        info.summary = summary_field.p.text

    # image
    thumb = soup.find('img', {'class' : 'thumb'})
    if thumb:
        info.image = thumb['src']
    else:
        info.image = None

    # tags: one tag per anchor inside the genre field
    genre_field = Retriever.get_field(body, u'Жанр:')
    if genre_field:
        genre_names = []
        for anchor in genre_field.findAll('a'):
            genre_names.append(anchor.text)
        info.tags = genre_names

    # links: anchor text -> href for every download anchor
    download_field = Retriever.get_field(body, u'Скачать книгу бесплатно:')
    if download_field:
        for anchor in download_field.findAll('a'):
            info.links[anchor.text] = anchor['href']

    return info
def get_bookinfo(entry):
    """Build a BookInfo from a parsed feed entry.

    ``entry`` is a parsed tag (presumably an Atom/OPDS ``<entry>`` -- confirm
    against the caller) exposing ``title``, ``author``, ``link``,
    ``content``/``summary`` and ``category`` children.

    Returns the populated BookInfo, or None when the entry has no link for
    any of the formats listed in ``Retriever.known_formats``.
    """
    info = BookInfo()
    info.title = entry.title.text
    info.authors = [entry.author.findChild().text]

    # language: try the "dc:" tag first, then the "dcterms:" one
    language_tag = entry.find("dc:language")
    if not language_tag:
        language_tag = entry.find("dcterms:language")
    if language_tag:
        info.language = language_tag.text
    else:
        info.language = "?"

    # links: one per known format, keyed by the format name
    for book_format in Retriever.known_formats:
        link_tag = entry.find("link", type="application/" + book_format)
        if link_tag:
            info.links[book_format] = link_tag["href"]

    # An entry without any downloadable file is not reported at all.
    if not info.links:
        return None

    # summary: prefer <content>, fall back to <summary>
    summary_tag = entry.find("content")
    if not summary_tag:
        summary_tag = entry.find("summary")
    info.summary = summary_tag.text if summary_tag else None

    # tags: use the category's "label" attribute when present, else "term"
    for category in entry.findAll("category"):
        label = None
        for attribute in ("label", "term"):
            if category.has_key(attribute):
                label = category[attribute]
                break
        if label:
            info.tags.append(label)

    return info
def get_book_info(soup):
    """Build a BookInfo from a parsed book page with labelled fields.

    Every value is obtained through ``Retriever.get_field(soup, label)``.
    The download link is requested with a third positional argument of
    ``False`` and its first anchor's ``href`` is used.

    Returns the populated BookInfo; ``links`` holds a single entry mapping
    the value of the format field to the download URL.

    NOTE(review): if the link field or its anchor is missing, the final
    lookup raises (AttributeError / TypeError) exactly as before -- confirm
    whether callers rely on that.
    """
    book_info = BookInfo()
    book_info.title = Retriever.get_field(soup, u'Название:')

    # The parentheses are backslash-escaped (presumably because get_field
    # treats the label as a regular expression -- confirm).  The backslashes
    # are doubled so the literal contains no invalid "\(" escape sequence;
    # the resulting string is unchanged.
    authors = Retriever.get_field(soup, u'Автор\\(ы\\):')
    if authors:
        book_info.authors = [authors]

    book_info.summary = Retriever.get_field(soup, u'Описание:')
    book_info.language = Retriever.get_field(soup, u'Язык:')

    # Local renamed from "format" to avoid shadowing the builtin.
    book_format = Retriever.get_field(soup, u'Формат:')
    link = Retriever.get_field(soup, u'Ссылка 1:', False).a['href']
    book_info.links = {book_format: link}
    return book_info
def get_bookinfo_from_tag_dl(dl):
    """Build a BookInfo from a parsed ``<dl>`` tag.

    The anchors under ``dl.dt.li`` are read positionally: the first one is
    the author, the second one the title (its ``href`` becomes the "shtml"
    link).  The language is always set to "ru".  The text of the first
    ``<dd>`` child, if any, becomes the summary.

    Raises IndexError when fewer than two anchors are present.
    """
    links = dl.dt.li.findAll("a")
    author_link = links[0]
    title_link = links[1]

    info = BookInfo()
    info.links["shtml"] = title_link["href"]
    info.authors = [author_link.text]
    info.title = title_link.text
    info.language = "ru"

    descriptions = dl.findAll("dd")
    if descriptions:
        first_description = descriptions[0]
        info.summary = first_description.text
    return info
def execute(self, html):
    """Parse a book page and schedule the resulting BookInfo for saving.

    Title, author name parts and language come from ``self.description``;
    summary, links, tags and picture are extracted from the parsed page by
    the ``Retriever`` helpers.  ``self.tasks`` is replaced with a single
    ``BookSavingTask`` for the new BookInfo.

    Always returns True.
    """
    soup = get_soup(html)
    description = self.description

    info = BookInfo()
    info.title = description['title']

    # Single author: "<first> <middle> <last>"
    full_name = "%s %s %s" % (
        description['firstname'],
        description['middlename'],
        description['lastname'],
    )
    info.authors = [full_name]

    info.pagelink = self.link
    info.language = description['language']

    info.summary = Retriever.get_summary(soup)
    info.links = Retriever.get_links(soup, self.link)
    info.tags = Retriever.get_tags(soup)
    info.image = Retriever.get_picture(soup, self.link, description['ID'])

    self.tasks = [BookSavingTask(info)]
    return True
def get_bookinfo_from_db(id):
    """Load the ``Book`` with the given id and convert it to a BookInfo.

    Copies title, page link and the language's ``short`` code, collects the
    names of all related authors and tags, maps every related book file's
    ``type`` to its ``link``, and stores the image name (None when the book
    has no image).

    Any exception raised by ``Book.objects.get`` propagates to the caller.
    """
    info = BookInfo()
    record = Book.objects.get(id=id)

    info.title = record.title

    author_names = []
    for author in record.author.all():
        author_names.append(author.name)
    info.authors = author_names

    info.pagelink = record.pagelink
    info.language = record.language.short

    # Only the first annotation is used.
    # NOTE(review): the annotation object itself (not a text attribute of
    # it) is stored as the summary -- confirm this is intended.
    annotation_list = record.annotation_set.all()
    if annotation_list:
        info.summary = annotation_list[0]

    for book_file in record.book_file.all():
        info.links[book_file.type] = book_file.link

    tag_names = []
    for tag in record.tag.all():
        tag_names.append(tag.name)
    info.tags = tag_names

    if record.image:
        info.image = record.image.name
    else:
        info.image = None
    return info