Code Example #1
def crawl_items():
    url = helpers.dequeue_items_url()
    if not url:
        helpers.log("WARNING: No URLs found in the queue. Retrying...")
        # pile.spawn(crawl_items)
        return
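    # save a bare "node" placeholder record for the listing URL itself
    # before crawling the individual items on the page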
    product = Product(category="node",
                      list_url=url,
                      crawl_time=datetime.now(),
                      asin="",
                      title="",
                      product_url="",
                      price="",
                      img_url="",
                      img_path="")
    product.save()
    page, html = helpers.make_request(url)
    if not page:
        return
    next_link_tag = page.select("a#pagnNextLink")
    if next_link_tag:
        helpers.log(" Found 'Next' link on {}: {}".format(
            url, next_link_tag[0]["href"]))
        helpers.enqueue_items_url(next_link_tag[0]["href"])
    items = page.select('.s-result-list li.s-result-item')
    category = extractors.get_category(page)
    for item in items:
        asin = extractors.get_asin(item)
        title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        list_url = url
        price = extractors.get_price(item)
        img_url = extractors.get_primary_img(item)
        img_path = extractors.download_img(img_url,
                                           category.split(":::")[-1], asin)
        product = Product(category=category,
                          asin=asin,
                          title=title,
                          product_url=product_url,
                          list_url=list_url,
                          price=price,
                          img_url=img_url,
                          img_path=img_path,
                          crawl_time=datetime.now())
        product.save()
    pile.spawn(crawl_items)
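Each example delegates fetching to a make_request(url) helper that returns a (page, html) pair, where page supports BeautifulSoup-style select / find / findAll. The helper itself is not shown on this page; the sketch below is one plausible implementation using requests and bs4, with the timeout and error handling as assumptions rather than any project's actual code:

import requests
from bs4 import BeautifulSoup

def make_request(url):
    # Hypothetical fetch helper: returns (None, None) on any failure so
    # callers can bail out with a simple `if not page` check.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("WARNING: request for {} failed: {}".format(url, exc))
        return None, None
    html = response.text
    return BeautifulSoup(html, "html.parser"), html

Note that Example #5 passes two arguments (an ASIN and a host), so its version of make_request presumably builds the search URL internally.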
Code Example #2
File: crawler.py  Project: Datahman/amazon-crawler
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    # Redis returns raw bytes; decode only after the empty-queue check,
    # otherwise a None result would raise an AttributeError here
    url = url.decode('utf-8')

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
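The dequeue_url / enqueue_url helpers (dequeue_items_url / enqueue_items_url in Example #1) are also external to these listings. Example #3's comment below mentions popping a random URL from a Redis listing_url_queue, so a Redis set is a reasonable guess; the connection settings here are illustrative only:

import redis

redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)
QUEUE_KEY = "listing_url_queue"

def enqueue_url(url):
    # add the URL to the set; duplicates are ignored automatically
    redis_client.sadd(QUEUE_KEY, url)

def dequeue_url():
    # remove and return a random member, or None when the set is empty
    return redis_client.spop(QUEUE_KEY)

Under Python 3 the redis client returns raw bytes, which is why this example decodes the dequeued value with url.decode('utf-8').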
Code Example #3
def fetch_listing():
    '''
    This is the root function that green threads call.
    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
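None of the snippets defines pile, but Example #3's docstring says green threads call this function, and each call re-spawns itself after queueing the next page. One plausible bootstrap uses eventlet's GreenPile; the pool size and the surrounding wiring are assumptions:

import eventlet
from datetime import datetime

eventlet.monkey_patch()  # make blocking I/O cooperative

if __name__ == "__main__":
    # fetch_listing is the function shown in the examples above
    crawl_time = datetime.now()    # the global each worker reads
    pile = eventlet.GreenPile(20)  # pool size is illustrative
    pile.spawn(fetch_listing)
    for _ in pile:
        # iterating the pile waits for every spawned greenlet to finish
        pass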
Code Example #4
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
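The get_title / get_url / get_price / get_primary_img extractors are likewise external. The sketch below shows their general shape against a single s-result-item element; the CSS classes are guesses at the legacy Amazon results markup, while the title and URL sentinel strings come from the checks in Example #5:

def get_title(item):
    tag = item.find("h2")
    return tag.get_text(strip=True) if tag else "<missing product title>"

def get_url(item):
    # hypothetical class name; real result markup varies by page version
    link = item.find("a", "s-access-detail-page")
    return link["href"] if link else "<missing product url>"

def get_price(item):
    tag = item.find("span", "s-price")
    return tag.get_text(strip=True) if tag else ""

def get_primary_img(item):
    img = item.find("img")
    return img.get("src") if img else None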
Code Example #5
def fetch_listing(ASIN, marketplace):

    global crawl_time
    url = marketplace.country_host + "/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + ASIN
    # the concatenated url above is never falsy, so guard on the host instead
    if not marketplace.country_host:
        log("WARNING: No marketplace host for ASIN {}. Skipping...".format(ASIN))
        return

    page, html = make_request(ASIN, marketplace.country_host)
    if not page:
        # the scrape failed; fall back to the product API instead of retrying
        log("WARNING: No page. Falling back to the API...")
        return amazon_api(ASIN, url, marketplace.country_code)

    item = page
    product_image = get_primary_img(item)
    if not product_image:
        log("No product image detected")
    product_title = get_title(item)
    product_url = get_url(item)
    product_price = get_price(item)
    product_indexing = get_indexing(item)
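    # the extractors return sentinel strings when a field is missing;
    # if both title and URL are missing, fall back to the API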
    if (product_title == '<missing product title>'
            and product_url == '<missing product url>'):
        product = amazon_api(ASIN, url, marketplace.country_code)
    else:
        product = ProductRecord(title=product_title,
                                product_url=format_url(product_url),
                                listing_url=format_url(url),
                                price=product_price,
                                primary_img=product_image,
                                product_indexing=product_indexing,
                                crawl_time=crawl_time,
                                asin=ASIN)
    return product
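Finally, every example normalizes links through format_url before saving. A plausible implementation resolves relative hrefs against the site root and strips tracking query strings; BASE_URL is illustrative, and a real crawler would read it from settings:

from urllib.parse import urljoin, urlsplit, urlunsplit

BASE_URL = "https://www.amazon.com"

def format_url(url):
    # resolve relative links, then keep only scheme + host + path
    absolute = urljoin(BASE_URL, url)
    scheme, netloc, path, _query, _fragment = urlsplit(absolute)
    return urlunsplit((scheme, netloc, path, "", ""))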