def execute(self, html):
    """Schedule one GetAllPagesGenre task per genre found in *html*.

    The 'link' entry of every genre returned by Retriever.get_genres is
    rewritten in place with make_correct_link(self.link, <old link>)
    before the tasks are created from each genre's 'link' and 'name'.
    The resulting list is stored in self.tasks. Always returns True.
    """
    page = get_soup(html)
    found_genres = Retriever.get_genres(page)

    # First pass: fix up every link in place.
    for entry in found_genres:
        entry['link'] = make_correct_link(self.link, entry['link'])

    # Second pass: build the follow-up tasks from the corrected entries.
    pending = []
    for entry in found_genres:
        pending.append(GetAllPagesGenre(entry['link'], entry['name']))
    self.tasks = pending
    return True
def execute(self, html):
    """Schedule DirPage and BookPage tasks for the entries found in *html*.

    Directories come from Retriever.get_accept_dirs and books from
    Retriever.get_accept_books; both yield pairs of which only the first
    element (the link) is used. Book links get the '_Ascii.txt' suffix
    appended. Directory tasks precede book tasks in self.tasks.
    Always returns True.
    """
    page = get_soup(html)
    dir_entries = Retriever.get_accept_dirs(page, self.link)
    book_entries = Retriever.get_accept_books(page, self.link)

    dir_tasks = []
    for dir_link, _ in dir_entries:
        dir_tasks.append(DirPage(dir_link))

    book_tasks = []
    for book_link, _ in book_entries:
        book_tasks.append(BookPage(book_link + '_Ascii.txt'))

    self.tasks = dir_tasks + book_tasks
    return True
def execute(self, html):
    """Schedule issue tasks plus one saving task for the magazine in *html*.

    The magazine info returned by Retriever.get_magazine_info gets its
    cover passed through make_correct_link and its link set to self.link.
    One MagazineIssueTask (magazine name, issue link) is created per issue
    link, followed by a single MagazineSavingTask for the magazine info.
    Always returns True.
    """
    page = get_soup(html)

    info = Retriever.get_magazine_info(page)
    info.cover = make_correct_link(self.link, info.cover)
    info.link = self.link

    # Resolve every issue link before any task is constructed.
    resolved_links = []
    for raw_link in Retriever.get_links_magazine_issue(page):
        resolved_links.append(make_correct_link(self.link, raw_link))

    pending = [MagazineIssueTask(info.name, target) for target in resolved_links]
    pending.append(MagazineSavingTask(info))
    self.tasks = pending
    return True
def execute(self, html):
    """Schedule one ArticleSavingTask per article of the issue in *html*.

    Year and number are obtained from self.link via
    Retriever.get_issue_year_number_by_link. Each article returned by
    Retriever.get_issue_articles is updated in place with the magazine
    (self.magazine), year, number and a link that is passed through
    make_correct_link and then Retriever.print_version.
    Always returns True.
    """
    page = get_soup(html)
    issue_year, issue_number = Retriever.get_issue_year_number_by_link(self.link)
    issue_articles = Retriever.get_issue_articles(page)

    for item in issue_articles:
        item.magazine = self.magazine
        item.year = issue_year
        item.number = issue_number
        corrected = make_correct_link(self.link, item.link)
        item.link = Retriever.print_version(corrected)

    pending = []
    for item in issue_articles:
        pending.append(ArticleSavingTask(item))
    self.tasks = pending
    return True
def execute(self, html):
    """Build a BookInfo from self.description and *html*, then schedule saving.

    Title, language and the single author string (first, middle and last
    name joined by spaces) come from self.description; summary, links,
    tags and image are read from the parsed page through Retriever.
    self.tasks is set to a single BookSavingTask. Always returns True.
    """
    page = get_soup(html)
    described = self.description

    author = "%s %s %s" % (
        described['firstname'],
        described['middlename'],
        described['lastname'],
    )

    info = BookInfo()
    info.title = described['title']
    info.authors = [author]
    info.pagelink = self.link
    info.language = described['language']
    info.summary = Retriever.get_summary(page)
    info.links = Retriever.get_links(page, self.link)
    info.tags = Retriever.get_tags(page)
    info.image = Retriever.get_picture(page, self.link, described['ID'])

    self.tasks = [BookSavingTask(info)]
    return True
def get_tag_by_link(link):
    """Return the tag mapped to the first path segment of *link*.

    The lookup key is the first '/'-separated segment of
    helpers.get_url_from_link(link). When Retriever.TAGS_MAPPING is empty
    it is filled first: the page at helpers.get_site_root_link(link) is
    downloaded and every (directory link, tag) pair reported by
    Retriever.get_dirs is stored under the directory link stripped of '/'.

    Returns None (after logging a failure) when the key is not in the
    mapping.
    """
    url = helpers.get_url_from_link(link).strip('/').split('/')[0]

    if not Retriever.TAGS_MAPPING:
        dm = WaitingDM()
        print('downloading main page of LibRu for retrieving tags...')
        html = dm.download(helpers.get_site_root_link(link))
        soup = get_soup(html)
        # Use a distinct loop name: reusing `link` here would overwrite the
        # argument and make the failure log below report the wrong link.
        for dir_link, tag in Retriever.get_dirs(soup):
            Retriever.TAGS_MAPPING[dir_link.strip('/')] = tag

    if url not in Retriever.TAGS_MAPPING:
        # TODO make other way for retrieving of tags for this case
        logger.write_fail("LibRu parser: can't find tag in main page",
                          link=link, url=url)
        return None
    return Retriever.TAGS_MAPPING[url]
def execute(self, html):
    """Schedule one MagazineInfoTask per magazine link found in *html*.

    Links come from Retriever.get_links_on_magazines and are each passed
    through make_correct_link(self.link, <link>) before the tasks are
    created. Always returns True.
    """
    page = get_soup(html)

    # Resolve all links first, then create the tasks.
    resolved = []
    for raw_link in Retriever.get_links_on_magazines(page):
        resolved.append(make_correct_link(self.link, raw_link))

    pending = []
    for target in resolved:
        pending.append(MagazineInfoTask(target))
    self.tasks = pending
    return True
def execute(self, html):
    """Extract the book info from *html* and schedule it for saving.

    The result of Retriever.get_book_info is passed, together with
    self.link, through Retriever.correct_book_info; the outcome is wrapped
    in a single BookSavingTask stored in self.tasks. Always returns True.
    """
    page = get_soup(html)
    raw_info = Retriever.get_book_info(page)
    corrected_info = Retriever.correct_book_info(raw_info, self.link)
    saving_task = BookSavingTask(corrected_info)
    self.tasks = [saving_task]
    return True
def execute(self, html):
    """Schedule one GetBookInfoTask per book link found in *html*.

    Links come from Retriever.get_book_links and are each passed through
    make_correct_link(self.link, <link>) before the tasks are created.
    Always returns True.
    """
    page = get_soup(html)

    # Resolve all links first, then create the tasks.
    resolved = []
    for book_link in Retriever.get_book_links(page):
        resolved.append(make_correct_link(self.link, book_link))

    pending = []
    for target in resolved:
        pending.append(GetBookInfoTask(target))
    self.tasks = pending
    return True
def execute(self, html):
    """Schedule a BookListingTask for this page and every linked page.

    The list of links starts with self.link and is extended with the
    result of Retriever.get_link_pages(soup, self.link); one
    BookListingTask is created per link, in that order.
    Always returns True.
    """
    page = get_soup(html)

    page_links = [self.link]
    page_links.extend(Retriever.get_link_pages(page, self.link))

    pending = []
    for target in page_links:
        pending.append(BookListingTask(target))
    self.tasks = pending
    return True
def execute(self, html):
    """Schedule one GenreTask per genre link found in *html*.

    Each link from Retriever.get_genres_links is passed through
    make_correct_link(self.link, <link>) and handed straight to GenreTask.
    Always returns True.
    """
    page = get_soup(html)

    pending = []
    for genre_link in Retriever.get_genres_links(page):
        # Resolve and wrap one link at a time, as the original does.
        pending.append(GenreTask(make_correct_link(self.link, genre_link)))

    self.tasks = pending
    return True
def execute(self, html):
    """Schedule one BookSavingTask per book info found in *html*.

    Every raw book info from Retriever.get_bookinfos is passed through
    Retriever.postprocess_bookinfo together with self.link and self.genre;
    the tasks are created only after all infos have been post-processed.
    Always returns True.
    """
    page = get_soup(html)

    processed = []
    for raw_info in Retriever.get_bookinfos(page):
        processed.append(
            Retriever.postprocess_bookinfo(raw_info, self.link, self.genre)
        )

    pending = []
    for info in processed:
        pending.append(BookSavingTask(info))
    self.tasks = pending
    return True
def execute(self, html):
    """Schedule one GenrePage task per page link generated for this genre.

    The page count is read from *html* with Retriever.get_pages_number and
    passed, with self.link, to Retriever.generate_all_pagelinks_for_genre;
    each resulting link is paired with self.genre in a GenrePage task.
    Always returns True.
    """
    page = get_soup(html)
    page_count = Retriever.get_pages_number(page)
    all_links = Retriever.generate_all_pagelinks_for_genre(self.link, page_count)

    pending = []
    for target in all_links:
        pending.append(GenrePage(target, self.genre))
    self.tasks = pending
    return True