def retrieve():
    """Scrape the top-100 Goodreads books and dump each one as a Product."""
    index_url = "{}{}".format(BASE_URL, BOOK_LIST)
    index_html = retrieve_data("goodreads.top-books.html", index_url)
    index = bs(index_html, "html.parser")
    book_rows = index.find_all("tr", {"itemtype": "http://schema.org/Book"})
    for book_row in book_rows[:100]:
        link = book_row.find("div", {"data-resource-type": "Book"}).a["href"]
        book_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("goodreads", link_to_fname(link))
        print("Fetching {}...".format(book_link))
        html = retrieve_data(fname, book_link)
        try:
            page = bs(html, "html.parser")
            title = clean_whitespace(page.find("h1", {"id": "bookTitle"}).get_text())
            description = clean_whitespace(
                page.select("div#description span")[-1].get_text()
            )
            # NOTE: `link` is rebound here — from this point on it is the
            # buy-button URL, not the book-page path.
            link = page.find("a", {"id": "buyButton"})["href"]
            genres = [
                clean_whitespace(g.get_text())
                for g in page.select(".left .bookPageGenreLink")
            ]
            image = page.find("img", {"id": "coverImage"})["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            product = Product(
                title,
                "{}{}".format(BASE_URL, link),
                image,
                "books",
                genres,
                description,
            )
            product.dump()
        except Exception as e:
            # Best-effort scraping: one malformed book page must not
            # abort the remaining 99.
            print("ERROR:", e)
        print("")
def retrieve_products_for_interest(interest):
    """Scrape up to 100 Uncommon Goods products for *interest* and dump each."""
    list_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest, QUERY_STR)
    listing_html = retrieve_data("uncommon-goods.{}.html".format(interest), list_url)
    listing = bs(listing_html, "html.parser")
    prod_links = [a["href"] for a in listing.select("article.product a")]
    for link in prod_links[:100]:
        prod_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("uncommon-goods", link_to_fname(link))
        print("Fetching {}...".format(prod_link))
        html = retrieve_data(fname, prod_link)
        soup = bs(html, "html.parser")
        try:
            title = clean_whitespace(soup.find("h1", {"itemprop": "name"}).get_text())
            story = soup.select_one(".theStoryCopy p")
            description = clean_whitespace(story.get_text())
            image = soup.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            # presumably the price node holds a bare number; float() will
            # raise (and be caught below) if it contains a currency symbol.
            price_text = soup.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price_text))
            tags = get_tags(description)
            product = Product(
                title,
                "{}{}".format(BASE_URL, link),
                image,
                interest,
                tags,
                description,
                price=price,
            )
            product.dump()
        except Exception as e:
            # Best-effort scraping: skip products whose pages don't match
            # the expected markup.
            print("ERROR:", e)
        print("")
def retrieve():
    """Scrape the IMDb top-100 films and dump each one as a Product."""
    list_link = "{}{}".format(BASE_URL, FILM_LIST)
    chart_html = retrieve_data("imdb.top-films.html", list_link)
    chart = bs(chart_html, "html.parser")
    anchors = chart.select("tbody.lister-list tr .titleColumn a")
    film_links = [a["href"] for a in anchors]
    for link in film_links[:100]:
        film_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("imdb", link_to_fname(link))
        print("Fetching {}...".format(film_link))
        html = retrieve_data(fname, film_link)
        soup = bs(html, "html.parser")
        try:
            title = clean_whitespace(soup.select_one(".title_wrapper h1").get_text())
            summary = soup.select_one(".plot_summary .summary_text")
            description = clean_whitespace(summary.get_text())
            image = soup.select_one(".poster a img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            # NOTE: `link` is rebound here — from this point on it is the
            # watch-option URL, not the film-page path.
            link = soup.select_one(".winner-option.watch-option")["data-href"]
            genres = [
                clean_whitespace(g.get_text())
                for g in soup.select('.title_wrapper .subtext a[href^="/genre"]')
            ]
            product = Product(
                title,
                "{}{}".format(BASE_URL, link),
                image,
                "films",
                genres,
                description,
            )
            product.dump()
        except Exception as e:
            # Best-effort scraping: one malformed film page must not
            # abort the remaining 99.
            print("ERROR:", e)
        print("")
import json

from classes import Product
from constants import PRODUCT_PATH

if __name__ == "__main__":
    # Re-serialize every stored product, defaulting missing/zero prices to 10.
    #
    # Fix: Path.iterdir() already yields paths prefixed with the parent
    # directory, so the old PRODUCT_PATH.joinpath(entry) double-joined the
    # directory (it only worked by accident when PRODUCT_PATH was absolute,
    # because joining an absolute path replaces the base). Use the iterdir
    # result directly.
    for fpath in PRODUCT_PATH.iterdir():
        print("Cleaning {}".format(fpath))
        with open(fpath) as f:
            data = json.load(f)
        # Pull the price out so the remaining values line up with Product's
        # positional parameters (assumes dict insertion order matches the
        # Product signature — TODO confirm against classes.Product).
        price = data.pop("price")
        # `price or 10` intentionally treats both None and 0 as "unpriced".
        product = Product(*data.values(), price=price or 10)
        product.dump()