def _get_book_subtitle(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's subtitle
    returns:
        subtitle (String):
            the subtitle of the book that is being scraped, built by
            concatenating every matched text fragment.
        None:
            when neither query matches anything or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's subtitle is.
    """
    try:
        subtitle = Par_Scrape.parse(content, "//*[@class='subtitle']/text()")
        # Compensates for google books placement of the text(): when the
        # direct query finds nothing, look one level deeper, inside the
        # nested span with dir='ltr'.
        if not subtitle:
            subtitle = Par_Scrape.parse(
                content, "//*[@class='subtitle']/span[@dir='ltr']/text()")
        if not subtitle:
            return None
        return "".join(subtitle)
    except Exception:
        # Best-effort scrape: any parsing failure means "no subtitle".
        return None
def _get_book_isbn_13(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's isbn_13.
    returns:
        isbn_13 (String):
            the first comma-separated entry of the ISBN metadata row
            that is exactly 13 characters long.
        None:
            when no entry has 13 characters or scraping fails.
    synopsis:
        The purpose of this function is to determine the book's
        isbn_13.
    """
    try:
        data = Par_Scrape.parse(
            content,
            (self.content_table
             + "/tr[@class='metadata_row']/td[@class='metadata_label']/span[contains(text(), 'ISBN')]"
             + "/../following-sibling::td/span/text()"))
        # Google books displays both isbn_10 and isbn_13 in the same cell,
        # separated by commas; the isbn_13 is picked out by its length.
        candidates = (part.strip() for part in data[0].split(','))
        return next((isbn for isbn in candidates if len(isbn) == 13), None)
    except Exception:
        # Best-effort scrape: a missing row (IndexError) or any other
        # parsing failure means "no isbn_13".
        return None
def _get_book_description(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's description
    returns:
        description (String):
            the text of the matched paragraphs, each followed by a
            space, with markup removed and surrounding whitespace
            stripped.
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's description is.
    """
    try:
        # The first matched paragraph is skipped on purpose ([1:]).
        paragraphs = Par_Scrape.parse(content, ".//div[@class='right']/p")[1:]
        description = "".join(
            paragraph.text + ' '
            for paragraph in paragraphs
            if paragraph.text is not None)
        # Run the collected text through BeautifulSoup to drop any
        # remaining markup.
        soup = BeautifulSoup(description, features='lxml')
        return soup.get_text().strip()
    except Exception:
        # Best-effort scrape: any parsing failure means "no description".
        return None
def _form_search_submission(self, search):
    """
    args:
        search (String):
            This is the parameter that will be searched for in the
            bookstore
    returns:
        link (String):
            This is the link that was generated based upon the search
            parameter
        None:
            when the page has no usable search form (no form, fewer
            than two controls, or the second control is not a text
            field), or when the result page contains the marker span
            checked below.
    synopsis:
        The purpose of this function is to check whether or not a
        search link is valid. If it is, then return the link,
        otherwise return None.
    """
    # NOTE(review): presumably a throttle between requests - confirm.
    time.sleep(.3)

    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open("https://books.google.com/")

    # Guard against a page layout without the expected form instead of
    # failing with an IndexError; an unusable form is "not valid".
    forms = list(br.forms())
    if not forms:
        return None
    br.form = forms[0]

    controls = br.form.controls
    if len(controls) < 2:
        return None
    control = controls[1]
    if control.type != "text":
        return None

    control.value = search
    br.submit()
    link = br.geturl()

    # Re-fetch the resulting page; if it contains the span below the link
    # is rejected.
    # NOTE(review): presumably this span marks a "no results" page - confirm.
    test_validity = requests.get(link)
    returned = Par_Scrape.parse(test_validity.content, "//span[@class='JZCD0c r0bn4c rQMQod']")
    if len(returned) != 0:
        return None
    return link
def _get_book_description(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's description
    returns:
        description (String):
            when several text fragments match, every non-empty
            fragment (markup removed) followed by a single space;
            when exactly one fragment matches, that fragment as
            scraped.
        None:
            when nothing matches or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's description is.
    """
    try:
        description = Par_Scrape.parse(
            content,
            self.content_table + "//div[@class='book-description']/p/text()")
        if len(description) > 1:
            # removal of HTML. Does miss some '\' characters
            texts = (
                BeautifulSoup(part, features='lxml').get_text()
                for part in description)
            return "".join(text + " " for text in texts if text != '')
        # A single fragment is returned untouched; an empty result raises
        # IndexError here and falls through to the handler below.
        return description[0]
    except Exception:
        # Best-effort scrape: any parsing failure means "no description".
        return None
def _get_book_authors(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the authors names.
    returns:
        authors (String):
            the book's authors; several authors are separated by
            ", ", a single author is returned on its own.
        None:
            when no author is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        authors are for the book being scraped.
    """
    try:
        authors = Par_Scrape.parse(
            content,
            self.content_table + "//h4[@class='book-written-by']//a/text()")
        if not authors:
            return None
        # join() places the separator by position. Comparing each name with
        # the last one by value drops the separator after any earlier name
        # that happens to equal the last one.
        return ", ".join(authors)
    except Exception:
        # Best-effort scrape: any parsing failure means "no authors".
        return None
def __get_book_description(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's description
    returns:
        description (String):
            the content of the og:description meta tag with newlines,
            tabs and apostrophes removed and non-breaking spaces
            turned into regular spaces.
        None:
            when the tag is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's description is.
    """
    try:
        raw_description = Par_Scrape.parse(
            content,
            (self.meta_Father_Type
             + "/following-sibling::meta[@property='og:description']/@content"))[0]
        return (raw_description
                .replace("\n", "")
                .replace("\t", "")
                .replace("'", "")
                .replace("\xa0", " "))
    except Exception:
        # Best-effort scrape: a missing tag (IndexError) or any other
        # parsing failure means "no description".
        return None
def _get_book_authors(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the authors names.
    returns:
        authors (String):
            the book's authors, each name followed by a single space
            (including the last one).
        None:
            when no author is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        authors are for the book being scraped.
    """
    try:
        authors = Par_Scrape.parse(
            content,
            (self.content_table
             + "/tr[@class='metadata_row']/td[@class='metadata_label']/span[contains(text(), 'Author')]"
             + "/../following-sibling::td/a/span/text()"))
        if not authors:
            return None
        # Process here is to compensate for multiple authors. The trailing
        # space after the last name is kept so the output format callers
        # already receive does not change.
        return "".join(author + " " for author in authors)
    except Exception:
        # Best-effort scrape: any parsing failure means "no authors".
        return None
def _get_book_description(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's description
    returns:
        description (String):
            every text fragment found under the 'synopsistext'
            element, concatenated in document order.
        None:
            when no text is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's description is.
    """
    try:
        # '//text()' collects the text in between all nested HTML tags,
        # which copes with the dynamic html formatting of google books
        # descriptions.
        desc_parts = Par_Scrape.parse(content, "//*[@id='synopsistext']//text()")
        if not desc_parts:
            return None
        return "".join(desc_parts)
    except Exception:
        # Best-effort scrape: any parsing failure means "no description".
        return None
def __get_authors__(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the authors names.
    returns:
        author (String):
            the text of the first 'value-field Colaborador' element
            that follows the last "Autor:" label; the whole text when
            the label is absent.
        None:
            when the element is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        authors are for the book being scraped.
    """
    try:
        authors_element = Par_Scrape.parse(
            content, "//*[@class='value-field Colaborador']")[0]
        return authors_element.text.split("Autor:")[-1]
    except Exception:
        # Best-effort scrape: a missing element (IndexError), an element
        # without text (AttributeError) or any other failure means
        # "no authors".
        return None
def __get_subtitle__(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's subtitle
    returns:
        subtitle (String):
            the text of the first 'value-field Subtitulo' element.
        None:
            when no such element exists or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's subtitle is.
    """
    try:
        subtitle_element = Par_Scrape.parse(
            content, "//*[@class='value-field Subtitulo']")
        if subtitle_element:
            return subtitle_element[0].text
        # No subtitle on the page: state the result explicitly instead of
        # relying on the implicit fall-through.
        return None
    except Exception:
        # Best-effort scrape: any parsing failure means "no subtitle".
        return None
def _get_book_series(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the series.
    returns:
        series (String):
            the text of the 'bookDetail' cell in the table row
            labelled 'Series:'.
        None:
            when the row is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        series is for the book being scraped (if it exists).
    """
    try:
        series_cells = Par_Scrape.parse(
            content, ".//tr[td='Series:']/td[@class='bookDetail']")
        return series_cells[0].text
    except Exception:
        # Best-effort scrape: a missing row (IndexError) or any other
        # parsing failure means "no series".
        return None
def _get_book_id(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's id.
    returns:
        id (String):
            the text of the 'bookDetail' cell in the table row
            labelled 'ISBN 13#:', which this site uses as the book id.
        None:
            when the row is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's id is that is being scraped.
    """
    try:
        id_cells = Par_Scrape.parse(
            content, ".//tr[td='ISBN 13#:']/td[@class='bookDetail']")
        return id_cells[0].text
    except Exception:
        # Best-effort scrape: a missing row (IndexError) or any other
        # parsing failure means "no id".
        return None
def _get_book_title(self, content):
    """
    args:
        content (requests.get):
            content is required in order to scrape the book's title.
    returns:
        title (String):
            the text of the first h1 element with class
            'audiobookTitle' under self.content_table.
        None:
            when no title is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's title is.
    """
    try:
        titles = Par_Scrape.parse(
            content,
            self.content_table + "//h1[@class='audiobookTitle']/text()")
        return titles[0]
    except Exception:
        # Best-effort scrape: a missing heading (IndexError) or any other
        # parsing failure means "no title".
        return None
def _get_book_url(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's url.
    returns:
        url:
            the result of Par_Scrape.parse for the href of the link
            inside the 'bookcover' element, exactly as returned (the
            result is NOT indexed here, so callers receive every
            match rather than a single string).
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's url is that is being scraped. This is required in
        order for functions to work properly.
    """
    try:
        # NOTE(review): the sibling getters return parse(...)[0]; this one
        # hands back the un-indexed result. Left as is because callers may
        # already index it - confirm before changing.
        return Par_Scrape.parse(content, "//*[@class='bookcover']/a/@href")
    except Exception:
        # Best-effort scrape: any parsing failure means "no url".
        return None
def _get_book_url(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's url.
    returns:
        url (String):
            url is book's url that is normally used, as determined
            by the website; the value ends with "/details".
        None:
            when building the url raises.
    synopsis:
        The purpose of this function is to determine what the
        book's url is that is being scraped. This is required in
        order for functions to work properly.
    """
    try:
        # NOTE(review): the expression below looks corrupted in this copy
        # of the file (masked host/credentials and an unbalanced closing
        # parenthesis). Presumably it concatenated a base url with the text
        # of a 'bookDetail' cell - restore it from version control and
        # confirm before relying on this method.
        return "http://*****:*****@class='bookDetail']")[0].text + "/details"
    except:
        return None
def _get_book_image_url(self, content):
    """
    args:
        content (requests.get):
            content is required in order to scrape the book image's
            url.
    returns:
        image_url (String):
            the src attribute of the first element whose title is
            'Front Cover'.
        None:
            when no such element exists or scraping fails.
    synopsis:
        The purpose of this function is to determine what the url
        is for the book's cover image.
    """
    try:
        cover_sources = Par_Scrape.parse(content, "//*[@title='Front Cover']/@src")
        return cover_sources[0]
    except Exception:
        # Best-effort scrape: a missing cover (IndexError) or any other
        # parsing failure means "no image url".
        return None
def _get_book_sale_price(self, content):
    """
    args:
        content (Request.get):
            content is needed in order to scrape the book's price if
            applicable
    returns:
        price:
            the result of Par_Scrape.parse for the text of the
            paragraph inside the 'fleft button-text' element, exactly
            as returned (the result is NOT indexed here).
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to scrape the book's price
        and return it if applicable.
    """
    try:
        # NOTE(review): unlike most sibling getters this returns the
        # un-indexed parse result; kept as is because callers may already
        # depend on that shape - confirm before changing.
        return Par_Scrape.parse(
            content,
            self.content_table + "//div[@class='fleft button-text']/div/p/text()")
    except Exception:
        # Best-effort scrape: any parsing failure means "no price".
        return None
def __get_book_authors(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the authors names.
    returns:
        authors (String):
            the text of the first contributor link found under
            self.right_col; further contributors are not returned.
        None:
            when no contributor is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        authors are for the book being scraped.
    """
    try:
        contributors = Par_Scrape.parse(
            content,
            (self.right_col + "/div[@class='contributors']/p/span/a/text()"))
        return contributors[0]
    except Exception:
        # Best-effort scrape: no contributor (IndexError) or any other
        # parsing failure means "no authors".
        return None
def _get_book_format(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to look for the
            'gb-get-book-not-available' element.
    returns:
        format (String):
            "PRINT" when the 'gb-get-book-not-available' element is
            present, "DIGITAL" otherwise.
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to return the book format
        "DIGITAL" or "PRINT" since google books only has E-books
        on their own site with links to "PRINT" books.
    """
    try:
        not_available = Par_Scrape.parse(
            content, "//*[@id='gb-get-book-not-available']")
        return "PRINT" if not_available else "DIGITAL"
    except Exception:
        # Best-effort scrape: any parsing failure means "unknown format".
        return None
def _get_book_sale_status(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's sale
            status
    returns:
        sale_status (Boolean):
            False when the 'gb-get-book-not-available' element is
            present, True otherwise.
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to determine if the book
        is available for sale.
    """
    try:
        not_available = Par_Scrape.parse(
            content, "//*[@id='gb-get-book-not-available']")
        return not not_available
    except Exception:
        # Best-effort scrape: any parsing failure means "unknown status".
        return None
def _get_book_sale_price(self, content):
    """
    args:
        content (Request.get):
            content is needed in order to scrape the e-book's price
            if applicable
    returns:
        (None):
            always, as currently implemented: either the
            'gb-get-book-not-available' element is present (no
            e-book), or it is absent and no price is extracted.
    synopsis:
        The purpose of this function is to scrape for the e-book's
        price and return it. Only the "not available" check exists
        so far; the price itself is not scraped.
    """
    try:
        if Par_Scrape.parse(content, "//*[@id='gb-get-book-not-available']"):
            return None
        # TODO: extract the actual price for available e-books. Until then
        # the result is made explicit rather than falling off the end of
        # the function.
        return None
    except Exception:
        # Best-effort scrape: any parsing failure means "no price".
        return None
def __get_book_title(self, content):
    """
    args:
        content (requests.get):
            content is required in order to scrape the book's title.
    returns:
        title (String):
            the content of the og:title meta tag that follows
            self.meta_Father_Type.
        None:
            when the tag is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's title is.
    """
    try:
        titles = Par_Scrape.parse(
            content,
            (self.meta_Father_Type
             + "/following-sibling::meta[@property='og:title']/@content"))
        return titles[0]
    except Exception:
        # Best-effort scrape: a missing tag (IndexError) or any other
        # parsing failure means "no title".
        return None
def _get_book_sale_status(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's sale
            status
    returns:
        sale_status (Boolean):
            False when a span with class
            'nonmember-notify save-later-text' is present under
            self.content_table, True otherwise.
        None:
            when scraping fails.
    synopsis:
        The purpose of this function is to determine if the book
        is available for sale.
    """
    try:
        # NOTE(review): presumably this span is only rendered for titles
        # that cannot be bought yet - confirm against the site.
        notify_later = Par_Scrape.parse(
            content,
            self.content_table + "//span[@class='nonmember-notify save-later-text']")
        return not notify_later
    except Exception:
        # Best-effort scrape: any parsing failure means "unknown status".
        return None
def __get_book_url(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's url.
    returns:
        url (String):
            the content of the og:url meta tag that follows
            self.meta_Father_Type.
        None:
            when the tag is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        book's url is that is being scraped. This is required in
        order for functions to work properly.
    """
    try:
        urls = Par_Scrape.parse(
            content,
            (self.meta_Father_Type
             + "/following-sibling::meta[@property='og:url']/@content"))
        return urls[0]
    except Exception:
        # Best-effort scrape: a missing tag (IndexError) or any other
        # parsing failure means "no url".
        return None
def __get_book_image_url(self, content):
    """
    args:
        content (requests.get):
            content is required in order to scrape the book image's
            url.
    returns:
        image_url (String):
            the content of the og:image meta tag that follows
            self.meta_Father_Type.
        None:
            when the tag is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine what the url
        is for the book's cover image.
    """
    try:
        image_urls = Par_Scrape.parse(
            content,
            (self.meta_Father_Type
             + "/following-sibling::meta[@property='og:image']/@content"))
        return image_urls[0]
    except Exception:
        # Best-effort scrape: a missing tag (IndexError) or any other
        # parsing failure means "no image url".
        return None
def _get_book_image_url(self, content):
    """
    args:
        content (requests.get):
            content is required in order to scrape the book image's
            url.
    returns:
        image_url (String):
            "https:" followed by the src attribute of the first img
            element with class 'book-cover' under self.content_table.
        None:
            when no cover image is found or scraping fails.
    synopsis:
        The purpose of this function is to determine what the url
        is for the book's cover image.
    """
    try:
        # audiobooks has the image without the front end of an acceptable
        # URL, so the scheme is prepended here.
        tail_url = Par_Scrape.parse(
            content,
            self.content_table + "//img[@class='book-cover']/@src")[0]
        return "https:" + tail_url
    except Exception:
        # Best-effort scrape: a missing image (IndexError) or any other
        # parsing failure means "no image url".
        return None
def _get_book_volume_number(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to get the volume number.
    returns:
        volume_number (Int):
            the text of the 'bookDetail' cell in the row labelled
            'Volume#:', converted to an integer.
        "None" (String):
            when the cell literally contains the text "None"; the
            string is passed through unchanged.
        None:
            when the row is missing, the text is not an integer, or
            scraping fails.
    synopsis:
        The purpose of this function is to determine what the
        volume_number is for the book being scraped (if it exists).
    """
    try:
        volume_number = Par_Scrape.parse(
            content, ".//tr[td='Volume#:']/td[@class='bookDetail']")[0].text
        # The literal text "None" is handed back as a string so callers
        # keep receiving the value they receive today.
        if volume_number == "None":
            return "None"
        return int(volume_number)
    except Exception:
        # Best-effort scrape: a missing row (IndexError), a non-numeric
        # value (ValueError) or any other failure means "no volume number".
        return None
def _get_ready_for_sale(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's release
            date.
    returns:
        ready_for_sale (Boolean):
            True when the release date is today or earlier, False
            when it lies in the future.
        None:
            when the release date is missing or cannot be parsed.
    synopsis:
        The purpose of this function is to determine if the book
        is available or not.
    """
    try:
        release_text = Par_Scrape.parse(
            content, ".//tr[td='Release Date: ']/td[@class='bookDetail']")[0].text
        # The first three '/'-separated parts are passed to date() in
        # order, i.e. the page is read as year/month/day.
        year, month, day = (int(part) for part in release_text.split('/')[:3])
        return date(year, month, day) <= date.today()
    except Exception:
        # Best-effort scrape: a missing row, a malformed date or any other
        # failure means "unknown".
        return None
def __get_book_isbn_13(self, content):
    """
    args:
        content (requests.get):
            content is needed in order to scrape the book's isbn_13.
    returns:
        isbn_13 (String):
            the content of the books:isbn meta tag that follows
            self.meta_Father_Type.
        None:
            when the tag is missing or scraping fails.
    synopsis:
        The purpose of this function is to determine the book's
        isbn_13.
    """
    try:
        isbns = Par_Scrape.parse(
            content,
            (self.meta_Father_Type
             + "/following-sibling::meta[@property='books:isbn']/@content"))
        return isbns[0]
    except Exception:
        # Best-effort scrape: a missing tag (IndexError) or any other
        # parsing failure means "no isbn_13".
        return None