def search_for_asin_on_amazon(self, query):
    '''Search Amazon's Kindle store for the book's ASIN using the given query.

    Returns the ASIN string on success, or None when the page cannot be
    fetched, the search has no matches, or no 1-Click result contains an ASIN.
    '''
    encoded = urlencode({'keywords': query})
    # encoded[9:] drops the leading 'keywords=' so the bare terms can be
    # embedded in the rh= refinement while the full pair is appended as-is.
    search_url = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
                  + encoded[9:] + '&' + encoded)
    try:
        page = open_url(self._connections['amazon'], search_url)
    except PageDoesNotExist:
        return None

    # Treat the search as empty only when Amazon did not offer a spelling
    # suggestion and did not silently broaden it to All Departments.
    no_results = ('did not match any products' in page
                  and 'Did you mean:' not in page
                  and 'so we searched in All Departments' not in page)
    if no_results:
        return None

    result_columns = BeautifulSoup(page).findAll('div', {'id': 'resultsCol'})
    if not result_columns:
        return None

    for column in result_columns:
        column_text = str(column)
        if 'Buy now with 1-Click' not in column_text:
            continue
        match = AMAZON_ASIN_PAT.search(column_text)
        if match:
            return match.group(1)
    return None
def get_settings(self, entity_id):
    '''Gets book's setting data.

    Follows each "/places/" link found on the already-loaded book page and
    scrapes that place's description. Returns a dict mapping sequential
    entity ids (starting at the given entity_id) to
    {'label', 'description', 'aliases'} dicts, or None (implicit) when the
    book page was never loaded.
    '''
    if self._page_source is None:
        return
    settings = self._page_source.xpath('//div[@id="bookDataBox"]/div[@class="infoBoxRowItem"]/a[contains(@href, "/places/")]')
    settings_data = {}
    for setting in settings:
        # Defensive re-check of the href; the xpath above already filters on "/places/".
        if '/places/' not in setting.get('href'):
            continue
        label = setting.text
        resp = open_url(self._connection, setting.get('href'))
        if not resp:
            continue
        setting_page = html.fromstring(resp)
        if setting_page is None:
            continue
        # Note the trailing space in "mainContentContainer " -- it matches
        # Goodreads' actual class attribute, not a typo here.
        desc = setting_page.xpath('//div[@class="mainContentContainer "]/div[@class="mainContent"]/div[@class="mainContentFloat"]/div[@class="leftContainer"]/span/text()')
        if len(desc) > 0 and re.sub(r'\s+', ' ', desc[0]).strip():
            # Python 2 only: collapse whitespace, then round-trip the bytes
            # through utf-8/latin-1 before wrapping in unicode.
            # NOTE(review): this decode/encode chain assumes byte strings and
            # would break under Python 3 -- confirm the target interpreter.
            desc = unicode(re.sub(r'\s+', ' ', desc[0]).strip().decode('utf-8').encode('latin-1'))
        else:
            desc = u'No description found on Goodreads.'
        settings_data[entity_id] = {'label': unicode(label.decode('utf-8').encode('latin-1')), 'description': desc, 'aliases': []}
        entity_id += 1
    return settings_data
def _read_secondary_author_pages(self, author_info):
    '''Read each secondary author's Goodreads page and record bio and image URL.

    author_info is a list of author dicts; index 0 (the primary author) is
    handled elsewhere and skipped here. Each processed dict gains 'page',
    'bio', and 'image_url' keys.
    '''
    if len(author_info) < 2:
        return
    for author in author_info[1:]:
        response = open_url(self._connection, author['url'])
        # Guard against a failed fetch: html.fromstring(None) would raise.
        # This matches the "if not resp: continue" pattern used by the other
        # scrapers in this file (get_characters, get_settings, _get_quotes).
        if not response:
            continue
        author['page'] = html.fromstring(response)
        author['bio'] = self._get_author_bio(author['page'])
        author['image_url'] = self._get_author_image(author['page'])
def _read_secondary_author_pages(self, author_info):
    '''Fetch every secondary author's page and store their bio and image URL.

    The first entry of author_info (the primary author) is skipped; each
    remaining author dict gains 'page', 'bio', and 'image_url' keys.
    '''
    if len(author_info) < 2:
        return
    for entry in author_info[1:]:
        page = html.fromstring(open_url(self._connection, entry['url']))
        entry['page'] = page
        entry['bio'] = self._get_author_bio(page)
        entry['image_url'] = self._get_author_image(page)
def get_characters(self, entity_id):
    '''Gets book's character data.

    Follows each "/characters/" link on the already-loaded book page,
    scraping the character's description and aliases. Returns a dict mapping
    sequential entity ids (starting at entity_id) to
    {'label', 'description', 'aliases'} dicts, or None (implicit) when the
    book page was never loaded. When prefs['expand_aliases'] is set, the
    alias lists are augmented via auto_expand_aliases.
    '''
    if self._page_source is None:
        return
    characters = self._page_source.xpath(
        '//div[@class="clearFloats" and contains(., "Characters")]//div[@class="infoBoxRowItem"]//a'
    )
    character_data = {}
    for char in characters:
        if '/characters/' not in char.get('href'):
            continue
        resp = open_url(self._connection, char.get('href'))
        if not resp:
            continue
        char_page = html.fromstring(resp)
        if char_page is None:
            continue
        desc = char_page.xpath(
            '//div[@class="workCharacterAboutClear"]/text()')
        if desc and re.sub(r'\s+', ' ', desc[0]).strip():
            # lxml text nodes are already str under Python 3; the old
            # .decode('utf-8').encode('latin-1') round-trip (a Python 2
            # leftover) raised AttributeError here.
            desc = re.sub(r'\s+', ' ', desc[0]).strip()
        else:
            desc = 'No description found on Goodreads.'
        alias_list = char_page.xpath(
            '//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()'
        )
        # Each text node may hold several comma-separated aliases; flatten,
        # collapse whitespace, and drop empties.
        alias_list = [
            re.sub(r'\s+', ' ', x).strip()
            for aliases in alias_list for x in aliases.split(',')
            if re.sub(r'\s+', ' ', x).strip()
        ]
        character_data[entity_id] = {
            'label': str(char.text),  # link text is already str in Python 3
            'description': desc,
            'aliases': alias_list
        }
        entity_id += 1
    if prefs['expand_aliases']:
        characters = {}
        for char, char_data in list(character_data.items()):
            characters[char] = [char_data['label']] + char_data['aliases']
        expanded_aliases = auto_expand_aliases(characters)
        for alias, ent_id in list(expanded_aliases.items()):
            character_data[ent_id]['aliases'].append(alias)
    return character_data
def __init__(self, url, connection, asin):
    '''Load the Goodreads book page at url and cache its parsed source.

    Extracts the Goodreads book id from the url (None when it does not
    match). On a failed fetch, _page_source stays None and construction
    still completes with all attributes defined.
    '''
    self._connection = connection
    self._asin = asin
    book_id_search = BOOK_ID_PAT.search(url)
    self._goodreads_book_id = book_id_search.group(1) if book_id_search else None
    # Initialize every attribute before the early return below; previously a
    # failed fetch left _author_recommendations/_author_other_books unset,
    # causing AttributeError in any later method that read them.
    self._page_source = None
    self._author_recommendations = None
    self._author_other_books = []
    response = open_url(self._connection, url)
    if not response:
        return
    self._page_source = html.fromstring(response)
def __init__(self, url, connection, asin):
    '''Fetch and parse the Goodreads book page at url.

    The Goodreads book id is pulled from the url when possible. If the page
    cannot be fetched, _page_source remains None but the object is still
    fully constructed.
    '''
    self._connection = connection
    self._asin = asin
    book_id_search = BOOK_ID_PAT.search(url)
    self._goodreads_book_id = book_id_search.group(1) if book_id_search else None
    self._page_source = None
    # Set these defaults up front: the original early-returned on an empty
    # response before assigning them, leaving the instance half-initialized.
    self._author_recommendations = None
    self._author_other_books = []
    response = open_url(self._connection, url)
    if not response:
        return
    self._page_source = html.fromstring(response)
def _parse_tooltip_info(self, book_data, book_id, image_url):
    '''Takes information retrieved from goodreads tooltips link and parses it.

    book_data is a parsed lxml fragment of one tooltip; book_id is the
    Goodreads id used for the ASIN fallback lookup; image_url is passed
    through into the result. Returns a recommendation dict, or None when no
    ASIN or no description could be found.
    '''
    # Title/authors: take the first matching node, or None when absent.
    title = book_data.xpath('//a[contains(@class, "readable")]')
    title = title[0].text if len(title) > 0 else None
    authors = book_data.xpath('//a[contains(@class, "authorName")]')
    authors = [authors[0].text] if len(authors) > 0 else None
    rating_info = book_data.xpath('//div[@class="bookRatingAndPublishing"]/span[@class="minirating"]')
    if len(rating_info) > 0:
        # The minirating text looks like "4.12 avg rating -- 1,234 ratings";
        # strip thousands separators, then read the number before "avg" and
        # the second-to-last token (the ratings count).
        rating_string = rating_info[0].text_content().strip().replace(',', '').split()
        rating = float(rating_string[rating_string.index('avg')-1])
        num_of_reviews = int(rating_string[-2])
    else:
        rating = None
        num_of_reviews = None
    try:
        # Preferred source: the "asin" query parameter on the Kindle preview link.
        asin_elements = book_data.xpath('//a[contains(@class, "kindlePreviewButtonIcon")]/@href')
        book_asin = urlparse.parse_qs(urlparse.urlsplit(asin_elements[0]).query)["asin"][0]
    except (KeyError, IndexError):
        book_asin = None
    # We should get the ASIN from the tooltips file, but just in case we'll
    # keep this as a fallback (though this only works in some regions - just USA?)
    if not book_asin:
        asin_data_page = open_url(self._connection, '/buttons/glide/' + book_id)
        book_asin = GOODREADS_ASIN_PAT.search(asin_data_page)
        if not book_asin:
            return None
        book_asin = book_asin.group(1)
    # Description: prefer the plain span, fall back to the freeTextContainer
    # span; give up (return None) when neither exists.
    desc = book_data.xpath('//div[@class="addBookTipDescription"]//span[not(contains(@id, "freeTextContainer"))]')
    desc_backup = book_data.xpath('//div[@class="addBookTipDescription"]//span[contains(@id, "freeTextContainer")]')
    if len(desc) > 0:
        desc = re.sub(r'\s+', ' ', desc[0].text).strip()
    elif len(desc_backup) > 0:
        desc = re.sub(r'\s+', ' ', desc_backup[0].text).strip()
    else:
        return None
    return {'class': 'featuredRecommendation', 'asin': book_asin, 'title': title, 'authors': authors, 'imageUrl': image_url, 'description': desc, 'hasSample': False, 'amazonRating': rating, 'numberOfReviews': num_of_reviews}
def _parse_tooltip_info(self, book_data, book_id, image_url):
    '''Parse one Goodreads tooltip fragment into a recommendation dict.

    Returns None when neither an ASIN nor a description can be extracted;
    image_url is passed straight through into the result.
    '''
    title_nodes = book_data.xpath('//a[contains(@class, "readable")]')
    title = title_nodes[0].text if len(title_nodes) > 0 else None

    author_nodes = book_data.xpath('//a[contains(@class, "authorName")]')
    authors = [author_nodes[0].text] if len(author_nodes) > 0 else None

    rating = None
    num_of_reviews = None
    rating_nodes = book_data.xpath('//div[@class="bookRatingAndPublishing"]/span[@class="minirating"]')
    if len(rating_nodes) > 0:
        # e.g. "4.12 avg rating -- 1,234 ratings" -> tokens without commas
        tokens = rating_nodes[0].text_content().strip().replace(',', '').split()
        rating = float(tokens[tokens.index('avg') - 1])
        num_of_reviews = int(tokens[-2])

    # First choice: the "asin" query parameter on the Kindle preview link.
    try:
        preview_hrefs = book_data.xpath('//a[contains(@class, "kindlePreviewButtonIcon")]/@href')
        book_asin = urlparse.parse_qs(urlparse.urlsplit(preview_hrefs[0]).query)["asin"][0]
    except (KeyError, IndexError):
        book_asin = None
    # We should get the ASIN from the tooltips file, but just in case we'll
    # keep this as a fallback (though this only works in some regions - just USA?)
    if not book_asin:
        glide_page = open_url(self._connection, '/buttons/glide/' + book_id)
        asin_match = GOODREADS_ASIN_PAT.search(glide_page)
        if not asin_match:
            return None
        book_asin = asin_match.group(1)

    plain_spans = book_data.xpath('//div[@class="addBookTipDescription"]//span[not(contains(@id, "freeTextContainer"))]')
    free_text_spans = book_data.xpath('//div[@class="addBookTipDescription"]//span[contains(@id, "freeTextContainer")]')
    if len(plain_spans) > 0:
        desc = re.sub(r'\s+', ' ', plain_spans[0].text).strip()
    elif len(free_text_spans) > 0:
        desc = re.sub(r'\s+', ' ', free_text_spans[0].text).strip()
    else:
        return None

    return {'class': 'featuredRecommendation',
            'asin': book_asin,
            'title': title,
            'authors': authors,
            'imageUrl': image_url,
            'description': desc,
            'hasSample': False,
            'amazonRating': rating,
            'numberOfReviews': num_of_reviews}
def search_for_asin_on_goodreads(self, url):
    '''Look up the ASIN for the book at the given Goodreads url.

    Returns the ASIN string, or None when the url has no book id, the glide
    page cannot be fetched, or no ASIN appears on it.
    '''
    id_match = BOOK_ID_PAT.search(url)
    if id_match is None:
        return None
    try:
        glide_page = open_url(self._connections['goodreads'],
                              '/buttons/glide/' + id_match.group(1))
    except PageDoesNotExist:
        return None
    asin_match = GOODREADS_ASIN_PAT.search(glide_page)
    return asin_match.group(1) if asin_match else None
def search_for_goodreads_url(self, keywords):
    '''Find a book's Goodreads url by searching with the given keywords.

    Returns the absolute url with its query string stripped, or None when
    the search fails or yields no results.
    '''
    try:
        page = open_url(self._connections['goodreads'],
                        '/search?' + urlencode({'q': keywords}))
    except PageDoesNotExist:
        return None
    # check to make sure there are results
    if 'No results' in page:
        return None
    match = GOODREADS_URL_PAT.search(page)
    if match is None:
        return None
    # Build the absolute URL, then drop any query parameters before returning.
    absolute = 'https://www.goodreads.com' + match.group(1)
    return urlparse.urlparse(absolute)._replace(query=None).geturl()
def get_characters(self, entity_id):
    '''Gets book's character data.

    Follows each "/characters/" link on the already-loaded book page and
    scrapes that character's description and aliases. Returns a dict mapping
    sequential entity ids (starting at entity_id) to
    {'label', 'description', 'aliases'} dicts, or None (implicit) when the
    book page was never loaded. When prefs['expand_aliases'] is set, alias
    lists are augmented via auto_expand_aliases.
    '''
    if self._page_source is None:
        return
    characters = self._page_source.xpath('//div[@class="clearFloats" and contains(., "Characters")]//div[@class="infoBoxRowItem"]//a')
    character_data = {}
    for char in characters:
        if '/characters/' not in char.get('href'):
            continue
        resp = open_url(self._connection, char.get('href'))
        if not resp:
            continue
        char_page = html.fromstring(resp)
        if char_page is None:
            continue
        desc = char_page.xpath('//div[@class="workCharacterAboutClear"]/text()')
        if desc and re.sub(r'\s+', ' ', desc[0]).strip():
            # Python 2 only: whitespace-collapse, then round-trip bytes
            # through utf-8/latin-1 before wrapping in unicode.
            # NOTE(review): this chain assumes byte strings and breaks under
            # Python 3 -- confirm the target interpreter.
            desc = unicode(re.sub(r'\s+', ' ', desc[0]).strip().decode('utf-8').encode('latin-1'))
        else:
            desc = u'No description found on Goodreads.'
        alias_list = char_page.xpath('//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()')
        # Each text node may contain several comma-separated aliases; flatten,
        # collapse whitespace, and drop empties.
        alias_list = [re.sub(r'\s+', ' ', x).strip() for aliases in alias_list for x in aliases.split(',') if re.sub(r'\s+', ' ', x).strip()]
        character_data[entity_id] = {'label': unicode(char.text.decode('utf-8').encode('latin-1')), 'description': desc, 'aliases': alias_list}
        entity_id += 1
    if prefs['expand_aliases']:
        # Build {entity_id: [label] + aliases} and merge any automatically
        # expanded aliases back into the matching character entries.
        characters = {}
        for char, char_data in character_data.items():
            characters[char] = [char_data['label']] + char_data['aliases']
        expanded_aliases = auto_expand_aliases(characters)
        for alias, ent_id in expanded_aliases.items():
            character_data[ent_id]['aliases'].append(alias)
    return character_data
def _get_book_info_from_tooltips(self, book_info):
    '''Fetch ASIN, title, authors, image url, description, and rating data.

    book_info is a (book_id, image_url) pair or a list of such pairs. One
    batched request to Goodreads' /tooltips endpoint covers every book;
    entries whose tooltip is missing or unparseable are skipped. Returns a
    list of recommendation dicts.
    '''
    # Accept a single (book_id, image_url) pair as well as a list of them.
    if isinstance(book_info, tuple):
        book_info = [book_info]
    link_pattern = 'resources[Book.{0}][type]=Book&resources[Book.{0}][id]={0}'
    query = "&".join([link_pattern.format(book_id)
                      for book_id, image_url in book_info])
    tooltips = json.loads(open_url(self._connection, '/tooltips?' + query))['tooltips']
    books_data = []
    for book_id, image_url in book_info:
        raw_tooltip = tooltips['Book.{0}'.format(book_id)]
        if not raw_tooltip:
            continue
        parsed = self._parse_tooltip_info(html.fromstring(raw_tooltip),
                                          book_id, image_url)
        if parsed:
            books_data.append(parsed)
    return books_data
def _get_book_info_from_tooltips(self, book_info):
    '''Gets books ASIN, title, authors, image url, description, and rating information.

    book_info is a (book_id, image_url) tuple or a list of such tuples. All
    books are fetched in one batched request to Goodreads' /tooltips
    endpoint; books whose tooltip is empty or unparseable are skipped.
    Returns a list of recommendation dicts from _parse_tooltip_info.
    '''
    # Normalize a single (book_id, image_url) pair into a one-element list.
    if isinstance(book_info, tuple):
        book_info = [book_info]
    books_data = []
    link_pattern = 'resources[Book.{0}][type]=Book&resources[Book.{0}][id]={0}'
    tooltips_page_url = '/tooltips?' + "&".join([link_pattern.format(book_id) for book_id, image_url in book_info])
    tooltips_page_info = json.loads(open_url(self._connection, tooltips_page_url))['tooltips']
    for book_id, image_url in book_info:
        book_data = tooltips_page_info['Book.{0}'.format(book_id)]
        if not book_data:
            continue
        book_data = html.fromstring(book_data)
        parsed_data = self._parse_tooltip_info(book_data, book_id, image_url)
        if not parsed_data:
            continue
        books_data.append(parsed_data)
    return books_data
def _get_quotes(self):
    '''Collect the book's quotes.

    Prefers the dedicated "More quotes" page when the book page links to
    one; otherwise scrapes the quotes box on the book page itself. Returns a
    list of whitespace-normalized quote strings, or None (implicit) when
    the page source is missing or the quotes page cannot be loaded.
    '''
    if self._page_source is None:
        return
    more_link = self._page_source.xpath('//a[@class="actionLink" and contains(., "More quotes")]')
    quotes = []
    if len(more_link) > 0:
        resp = open_url(self._connection, more_link[0].get('href'))
        if not resp:
            return
        quotes_page = html.fromstring(resp)
        if quotes_page is None:
            return
        quote_nodes = quotes_page.xpath('//div[@class="quoteText"]')
    else:
        quote_nodes = self._page_source.xpath('//div[@class=" clearFloats bigBox" and contains(., "Quotes from")]//div[@class="bigBoxContent containerWithHeaderContent"]//span[@class="readable"]')
    for node in quote_nodes:
        quotes.append(re.sub(r'\s+', ' ', node.text).strip().decode('ascii', 'ignore'))
    return quotes
def _get_quotes(self):
    '''Gets book's quote data.

    Uses the dedicated "More quotes" page when the book page links to one,
    otherwise falls back to the quotes box on the book page itself. Returns
    a list of whitespace-normalized quote strings, or None (implicit) when
    the page source is missing or the quotes page cannot be loaded.
    '''
    if self._page_source is None:
        return
    quotes_page = self._page_source.xpath('//a[@class="actionLink" and contains(., "More quotes")]')
    quotes = []
    if len(quotes_page) > 0:
        resp = open_url(self._connection, quotes_page[0].get('href'))
        if not resp:
            return
        quotes_page = html.fromstring(resp)
        if quotes_page is None:
            return
        for quote in quotes_page.xpath('//div[@class="quoteText"]'):
            # NOTE(review): .decode('ascii', 'ignore') on str is Python 2
            # only -- confirm the target interpreter.
            quotes.append(re.sub(r'\s+', ' ', quote.text).strip().decode('ascii', 'ignore'))
    else:
        # No "More quotes" page; scrape the quotes box embedded in the book
        # page (note the leading space in " clearFloats bigBox" matches
        # Goodreads' actual class attribute).
        for quote in self._page_source.xpath('//div[@class=" clearFloats bigBox" and contains(., "Quotes from")]//div[@class="bigBoxContent containerWithHeaderContent"]//span[@class="readable"]'):
            quotes.append(re.sub(r'\s+', ' ', quote.text).strip().decode('ascii', 'ignore'))
    return quotes
def get_settings(self, entity_id):
    '''Gets book's setting data.

    Follows each "/places/" link on the already-loaded book page and scrapes
    that place's description. Returns a dict mapping sequential entity ids
    (starting at entity_id) to {'label', 'description', 'aliases'} dicts, or
    None (implicit) when the book page was never loaded.
    '''
    if self._page_source is None:
        return
    settings = self._page_source.xpath(
        '//div[@id="bookDataBox"]/div[@class="infoBoxRowItem"]/a[contains(@href, "/places/")]'
    )
    settings_data = {}
    for setting in settings:
        # Defensive re-check; the xpath above already filters on "/places/".
        if '/places/' not in setting.get('href'):
            continue
        label = setting.text
        resp = open_url(self._connection, setting.get('href'))
        if not resp:
            continue
        setting_page = html.fromstring(resp)
        if setting_page is None:
            continue
        # Trailing space in "mainContentContainer " matches Goodreads' actual
        # class attribute.
        desc = setting_page.xpath(
            '//div[@class="mainContentContainer "]/div[@class="mainContent"]/div[@class="mainContentFloat"]/div[@class="leftContainer"]/span/text()'
        )
        if len(desc) > 0 and re.sub(r'\s+', ' ', desc[0]).strip():
            # lxml text nodes are already str under Python 3; the old
            # .decode('utf-8').encode('latin-1') round-trip (a Python 2
            # leftover) raised AttributeError here.
            desc = re.sub(r'\s+', ' ', desc[0]).strip()
        else:
            desc = 'No description found on Goodreads.'
        settings_data[entity_id] = {
            'label': str(label),  # link text is already str in Python 3
            'description': desc,
            'aliases': []
        }
        entity_id += 1
    return settings_data
def _read_primary_author_page(self, author_info):
    '''Reads the primary author's page and gets his/her bio, image url, and image encoded into base64.

    The first entry of author_info gains 'page', 'bio', 'image_url', and
    'encoded_image' keys.
    '''
    primary = author_info[0]
    page = html.fromstring(open_url(self._connection, primary['url']))
    primary['page'] = page
    primary['bio'] = self._get_author_bio(page)
    image_url, encoded_image = self._get_author_image(page, encode_image=True)
    primary['image_url'] = image_url
    primary['encoded_image'] = encoded_image
def _read_primary_author_page(self, author_info):
    '''Fetch the primary author's page; record bio, image url, and base64-encoded image.

    Mutates author_info[0] in place, adding 'page', 'bio', 'image_url', and
    'encoded_image' keys.
    '''
    author = author_info[0]
    author_page = html.fromstring(open_url(self._connection, author['url']))
    author['page'] = author_page
    author['bio'] = self._get_author_bio(author_page)
    author['image_url'], author['encoded_image'] = self._get_author_image(
        author_page, encode_image=True)