def get_product_info(self, url, page, category_code):
    url_sanitized = format_url(url)
    title = self.get_product_title(page)
    primary_image = self.get_product_primary_image(page)
    price = self.get_product_price(page)
    features = self.get_product_features(page)
    extra_info = self.get_product_extra_info(page, category_code)

    if not primary_image:
        log("No product image detected, skipping")
        return

    product = ProductRecord(
        title=title,
        product_url=url_sanitized,
        listing_url=url_sanitized,  # TODO: delete attr
        price=price,
        primary_img=primary_image,
        crawl_time=None,
        category_code=category_code,
        category=CATEGORY_LABELS[int(category_code)],
        features=features,
        asin=extra_info.get('asin'),
        dimensions=extra_info.get('product_dimensions'),
        weight=extra_info.get('product_weight'),
        shipping_weight=extra_info.get('shipping_weight'),
        package_dimensions=extra_info.get('product_dimensions'),
        package_weight=extra_info.get('package_weight'),
    )
    product_id = product.save()
    if product_id:
        log('Product saved! {}'.format(product_id))
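# These snippets lean on a `format_url` normalizer that is not defined here.
# A minimal sketch, assuming it only needs to canonicalize Amazon/Walmart
# links (prepend the host for relative paths, drop query string and fragment).
# This is a hypothetical stand-in, not the project's actual helper.
from urllib.parse import urlsplit, urlunsplit

def format_url(url, walmart=False):
    # Assumed hosts; the real helper may read these from settings.
    host = "https://www.walmart.com" if walmart else "https://www.amazon.com"
    if url.startswith("/"):
        url = host + url  # relative link: add the assumed host
    scheme, netloc, path, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))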
def handle_listing(page, platform, url, sql_handler):
    if platform == 'indeed':
        title = extractors.extract_element_text(
            page, 'h1', {'class': 'jobsearch-JobInfoHeader-title'})
        company = extractors.extract_element_text(
            page, 'div', {'class': 'jobsearch-CompanyReview--heading'})
        if not company:
            company = extractors.extract_element_text(
                page, 'div', {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'})
        job_meta_header = extractors.extract_element_text(
            page, 'span', {'class': 'jobsearch-JobMetadataHeader-item'})
        desc = extractors.extract_element_text(
            page, 'div', {'id': 'jobDescriptionText'})
        url = extractors.extract_element_attr_value(
            page, 'meta', {'id': 'indeed-share-url'}, 'content')
        job_id = helpers.get_url_param_value(url, 'jk')
        date = extractors.extract_indeed_job_footer_text(page)
        sql_handler.save_indeed_job(job_id=job_id, date=date, company=company,
                                    title=title, job_meta=job_meta_header,
                                    text=desc, url=url, platform=platform)

    elif platform == 'twitter':
        # Follow pagination tokens until the response stops returning one.
        next_token = handle_twitter_response(page)
        while next_token:
            token_url = helpers.format_url(
                url, platform, add_param={'pagination_token': next_token})
            page = helpers.make_request(token_url, platform)
            next_token = handle_twitter_response(page)

    elif platform == 'Volkswagen_press':
        release_id = platform + '_' + helpers.get_url_path_element(url, -1)
        title = extractors.extract_element_text(page, 'h1', {'class': 'page--title'})
        company = "Volkswagen"
        date = extractors.extract_element_text(page, 'div', {'class': 'meta--item'}, 0)
        date_string = extractors.extract_date_string_from_text(date, platform)
        meta_topics = extractors.extract_child_element_text(
            page, 'div', {'class': 'meta--item'}, 'a', {'content-link': ''}, 2, 0)
        short_summary = extractors.extract_list_text_by_parent(
            page, 'div', {'class': 'topic-list'})
        summary = extractors.extract_child_element_text(
            page, 'div', {'class': 'page-item--intro'}, 'p', None, 0, 0)
        text = extractors.extract_concatinated_text_by_element(
            page, 'div', {'class': 'page-item--text'}, 'p')
        sql_handler.save_press_release(release_id=release_id, company=company,
                                       release_date=date_string, topics=meta_topics,
                                       url=url, title=title,
                                       short_summary=short_summary,
                                       summary=summary, text=text)
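# The `extractors.extract_element_text` calls above are assumed to be thin
# BeautifulSoup wrappers. A minimal sketch under that assumption, with a
# hypothetical signature matching the optional positional index used above:
def extract_element_text(page, tag, attrs=None, index=0):
    # Return the stripped text of the index-th matching element, or None.
    matches = page.find_all(tag, attrs=attrs) if attrs else page.find_all(tag)
    if len(matches) > index:
        return matches[index].get_text(strip=True)
    return None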
def fetch_listing():
    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    url = url.decode('utf-8')  # Redis returns bytes

    page, html = make_request(url)
    if not page:
        return
    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
def fetch_listing():
    '''
    This is the root function that green threads call.
    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
def fetch_listing():
    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
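# The `enqueue_url` / `dequeue_url` helpers used by the listing fetchers above
# are assumed to sit on top of a Redis set: one variant's comment mentions
# popping a random URL, and another decodes the popped value from bytes.
# A minimal sketch using redis-py, with the key name and connection settings
# as assumptions:
import redis

rdb = redis.Redis(host="localhost", port=6379, db=0)
QUEUE_KEY = "listing_url_queue"  # assumed key name

def enqueue_url(url):
    # SADD deduplicates automatically; returns 1 if the URL was new to the set.
    return rdb.sadd(QUEUE_KEY, url)

def dequeue_url():
    # SPOP removes and returns a random member (as bytes), or None when empty.
    return rdb.spop(QUEUE_KEY)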
def get_url(item):
    # Return the product detail-page link, or a placeholder when it is missing.
    try:
        link_tag = item.select("a.s-access-detail-page")[0]
        return format_url(link_tag['href'])
    except Exception:
        return "<missing product url>"
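# Quick illustration of get_url on a small search-result fragment (assumes
# BeautifulSoup is installed and a format_url helper like the sketch above is
# in scope; the ASIN in the href is a made-up placeholder):
from bs4 import BeautifulSoup

snippet = '''
<div class="s-result-item">
  <a class="a-link-normal s-access-detail-page" href="/dp/B000EXAMPLE">Example item</a>
</div>
'''
item = BeautifulSoup(snippet, "html.parser").find("div")
print(get_url(item))  # canonical product URL built from the relative href
print(get_url(BeautifulSoup("<div></div>", "html.parser").div))  # "<missing product url>"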
def fetch_listing(ASIN, marketplace):
    global crawl_time
    url = (marketplace.country_host
           + "/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + ASIN)

    page, html = make_request(ASIN, marketplace.country_host)
    if page is None:
        # Scrape failed: fall back to the product API for this marketplace.
        log("WARNING: No page for {}. Falling back to the API".format(ASIN))
        return amazon_api(ASIN, url, marketplace.country_code)

    item = page
    product_image = get_primary_img(item)
    if not product_image:
        log("No product image detected, skipping")

    product_title = get_title(item)
    product_url = get_url(item)
    product_price = get_price(item)
    product_indexing = get_indexing(item)

    if (product_title == '<missing product title>'
            and product_url == '<missing product url>'):
        # Nothing useful scraped from the page; use the API instead.
        product = amazon_api(ASIN, url, marketplace.country_code)
    else:
        product = ProductRecord(title=product_title,
                                product_url=format_url(product_url),
                                listing_url=format_url(url),
                                price=product_price,
                                primary_img=product_image,
                                product_indexing=product_indexing,
                                crawl_time=crawl_time,
                                asin=ASIN)
    return product
def fetch_listing(start, end):
    global crawl_time
    with open('amazon-products.p', 'rb') as pf:
        product_dict = pickle.load(pf)
    # Iterate over the pickled product URLs (the dict is keyed by URL).
    product_urls = list(product_dict.keys())

    index = start - 1
    count = 0
    for product_url in product_urls:
        index += 1
        page1, html1 = make_request(product_url)
        try:
            # visit the page specified by product_url
            temp_dict = {}
            product_title = product_dict[product_url]
            product_price = page1.find(
                "span", "a-size-medium a-color-price").get_text().strip()

            # extract product info from the comparison table
            table = page1.find(
                "table",
                "a-bordered a-horizontal-stripes a-spacing-mini a-size-base comparison_table")
            for i in table.findAll("tr"):
                if "a-span3 comparison_attribute_name_column comparison_table_first_col" in str(i):
                    k = i.find("td").find("span").get_text()
                    v = i.find("th").find("span").get_text()
                    temp_dict[v] = k

            # extract product info from the product details tables
            tables = page1.findAll("table", "a-keyvalue prodDetTable")
            for table2 in tables:
                for i in table2.findAll("tr"):
                    k = i.find("td").get_text().strip()
                    v = i.find("th").get_text().strip()
                    temp_dict[v] = k

            product = ProductRecord(title=product_title,
                                    product_url=format_url(product_url),
                                    price=product_price,
                                    properties=temp_dict)
            product_name = settings.a_products_path + str(index) + ".p"
            pickle.dump(product, open(product_name, 'wb'))

            count += 1
            print(count, index, product_price)
            sys.stdout.flush()
        except Exception as e:
            print("Exception##: " + str(index) + '\t' + str(e))
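# Each ProductRecord above is pickled to its own file under
# settings.a_products_path. A small sketch for reading them back in bulk;
# the "products/*.p" glob is an assumption, point it at the configured path:
import glob
import pickle

def load_scraped_products(path_glob="products/*.p"):
    products = []
    for fname in sorted(glob.glob(path_glob)):
        with open(fname, "rb") as fh:
            products.append(pickle.load(fh))
    return products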
def dump_urls():
    visited = {}
    with open(settings.w_URL_file, 'w') as w:
        while queue:  # while queue is not empty
            url = dequeue_url()
            if not url:
                log("Queue empty")
                return
            if url in visited:  # we've already seen this product
                continue
            else:
                visited[url] = True  # mark that we've seen it
            # need to add host to url
            url = format_url(url, walmart=True)
            w.write('%s\n' % url)
def begin_crawl(crawl_more):
    visited = {}
    product_dict = {}
    if crawl_more:
        # Seed `visited` with URLs from a previous crawl so they are not re-queued.
        with open(settings.a_URL_file, 'r') as w:
            urls = w.readlines()
        for url in urls:
            url = url.strip()
            visited[url] = True

    w = open(settings.a_URL_file, 'a')
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines

            page, html = make_request(line)
            url = line
            count = 0
            while page is not None and count <= 50:
                items = page.findAll("li", "s-result-item")
                for item in items[:settings.max_details_per_listing]:
                    product_image = get_primary_img(item)
                    if not product_image:
                        continue
                    product_title = get_title(item)
                    product_url = get_url(item)
                    product_price = get_price(item)
                    if product_url not in visited:
                        count += 1
                        print(product_url, product_price, product_title)
                        visited[product_url] = True  # mark that we've seen it
                        # need to add host to url
                        product_url = format_url(product_url)
                        w.write('%s\n' % product_url)
                        product_dict[product_url] = (product_title, product_price)
                        print(count, product_url, product_dict[product_url])

                next_link = page.find("a", id="pagnNextLink")
                if next_link:
                    page, html = make_request(next_link["href"])
                    url = next_link["href"]
                else:
                    break  # no next page; stop paging this start URL
    w.close()
    pickle.dump(product_dict, open("amazon-products.p", "wb"))
def begin_crawl(session):
    # explode out all of our category `start_urls` into subcategories
    with open(settings.w_start_file, "r") as f:
        # create a fresh dryscrape session (the passed-in one is unused)
        session = dryscrape.Session()
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines

            url = line
            session.visit(url)
            response = session.body()
            soup = BeautifulSoup(response, "html5lib")
            count = 0
            i = 1  # starting page
            while soup is not None:
                print('page %d of link: %s' % (i, line))
                # look for products listed on this page
                results = soup.findAll(
                    'div', 'search-result-gridview-item clearfix')  # items in gridview
                for result in results:
                    link = result.find('a')
                    if not link:
                        continue
                    link = link['href']
                    count += 1
                    enqueue_url(link)
                i += 1

                # go to list of pages at bottom
                p_list = soup.find('ul', 'paginator-list').findAll('li')
                for p in p_list:
                    # search for the 'next' ordinal page and visit it
                    # for the next iteration of the while loop
                    if not p.has_attr('class') and str(i) in p.find('a').text:
                        url = format_url(p.find('a')['href'], walmart=True)
                        session.visit(url)
                        response = session.body()
                        soup = BeautifulSoup(response, "html5lib")
                        break
                else:
                    # no ordinal matched: there is no next page, stop searching this link
                    soup = None

            log("Found {} results on {}".format(count, line))