Example 1
def retrieve():
    """Scrape Goodreads' top-books list and dump each book as a Product."""
    list_link = "{}{}".format(BASE_URL, BOOK_LIST)
    html = retrieve_data("goodreads.top-books.html", list_link)
    soup = bs(html, "html.parser")
    rows = soup.find_all("tr", {"itemtype": "http://schema.org/Book"})

    for row in rows[:100]:
        link = row.find("div", {"data-resource-type": "Book"}).a["href"]
        book_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("goodreads", link_to_fname(link))

        print("Fetching {}...".format(book_link))
        html = retrieve_data(fname, book_link)

        try:
            soup = bs(html, "html.parser")
            title = soup.find("h1", {"id": "bookTitle"}).get_text()
            title = clean_whitespace(title)
            # The last description span holds the full, untruncated text
            description = soup.select("div#description span")[-1].get_text()
            description = clean_whitespace(description)
            # The buy-button link, not the list link, becomes the product URL
            link = soup.find("a", {"id": "buyButton"})["href"]
            genres = soup.select(".left .bookPageGenreLink")
            genres = [clean_whitespace(genre.get_text()) for genre in genres]
            image = soup.find("img", {"id": "coverImage"})["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            product = Product(title, "{}{}".format(BASE_URL, link), image,
                              "books", genres, description)
            product.dump()
        except Exception as e:
            print("ERROR:", e)
        print("")
Example 2
def retrieve_products_for_interest(interest):
    """Scrape Uncommon Goods gifts for one interest and dump them as Products."""
    list_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest,
                                        QUERY_STR)
    html = retrieve_data("uncommon-goods.{}.html".format(interest), list_url)
    soup = bs(html, "html.parser")
    prod_links = [link["href"] for link in soup.select("article.product a")]

    for link in prod_links[:100]:
        prod_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("uncommon-goods", link_to_fname(link))

        print("Fetching {}...".format(prod_link))
        html = retrieve_data(fname, prod_link)
        soup = bs(html, "html.parser")

        try:
            title = soup.find("h1", {"itemprop": "name"}).get_text()
            title = clean_whitespace(title)
            description = soup.select_one(".theStoryCopy p").get_text()
            description = clean_whitespace(description)
            image = soup.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            price = soup.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price))
            tags = get_tags(description)
            product = Product(title,
                              "{}{}".format(BASE_URL, link),
                              image,
                              interest,
                              tags,
                              description,
                              price=price)
            product.dump()
        except Exception as e:
            print("ERROR:", e)
        print("")
Example 3
def retrieve():
    """Scrape IMDb's top-films chart and dump each film as a Product."""
    list_link = "{}{}".format(BASE_URL, FILM_LIST)
    html = retrieve_data("imdb.top-films.html", list_link)
    soup = bs(html, "html.parser")
    film_links = soup.select("tbody.lister-list tr .titleColumn a")
    film_links = [link["href"] for link in film_links]

    for link in film_links[:100]:
        film_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("imdb", link_to_fname(link))

        print("Fetching {}...".format(film_link))
        html = retrieve_data(fname, film_link)
        soup = bs(html, "html.parser")

        try:
            title = soup.select_one(".title_wrapper h1").get_text()
            title = clean_whitespace(title)
            description = soup.select_one(".plot_summary .summary_text")
            description = clean_whitespace(description.get_text())
            image = soup.select_one(".poster a img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            # The watch-option link (data-href) becomes the product URL
            link = soup.select_one(".winner-option.watch-option")["data-href"]
            genres = soup.select(".title_wrapper .subtext a[href^=\"/genre\"]")
            genres = [clean_whitespace(genre.get_text()) for genre in genres]
            product = Product(title,
                              "{}{}".format(BASE_URL, link),
                              image,
                              "films",
                              genres,
                              description)
            product.dump()
        except Exception as e:
            print("ERROR:", e)
        print("")
Example 4
import json
from classes import Product
from constants import PRODUCT_PATH

if __name__ == "__main__":
    # Re-dump every stored product, substituting a default price of 10
    # when the scraped price is missing or zero.
    for fpath in PRODUCT_PATH.iterdir():  # iterdir() already yields full paths
        print("Cleaning {}".format(fpath))
        with open(fpath) as f:
            data = json.load(f)
        price = data.pop("price")
        # The remaining values are assumed to be in constructor order:
        # title, link, image, category, tags, description.
        product = Product(*data.values(), price=price or 10)
        product.dump()
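The Product class lives in classes.py and is only imported in these examples. The sketch below is consistent with how it is constructed and dumped above; the field order, file naming, and JSON layout are assumptions, and the real class may differ:

import json
import re

from constants import PRODUCT_PATH  # directory holding the dumped JSON files


class Product:
    def __init__(self, title, link, image, category, tags, description,
                 price=None):
        self.title = title
        self.link = link
        self.image = image
        self.category = category
        self.tags = tags
        self.description = description
        self.price = price

    def dump(self):
        # Serialize to PRODUCT_PATH/<slug>.json, keeping "price" last so
        # Example 4 can pop it and rebuild the Product positionally.
        data = {"title": self.title, "link": self.link, "image": self.image,
                "category": self.category, "tags": self.tags,
                "description": self.description, "price": self.price}
        slug = re.sub(r"[^a-z0-9]+", "-", self.title.lower()).strip("-")
        with open(PRODUCT_PATH / "{}.json".format(slug), "w") as f:
            json.dump(data, f, indent=2)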