def asin_to_first_review_url(self, asin):
    listing_url = "https://www.amazon.com/dp/" + asin
    soup = amazon_module.download_soup_by_url(listing_url)
    first_review_url_part2 = soup.find(id="dp-summary-see-all-reviews")["href"]
    first_review_url = "https://www.amazon.com" + first_review_url_part2
    return first_review_url
def keyword_to_long_tail_keyword_list(keyword):
    try:
        print("keyword:", keyword)
        # The dashes below should match the number of characters in the search
        # term; extra dashes usually mean the keyword has leading or trailing spaces.
        print("-" * (len("keyword: ") + len(keyword)))
        url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=Y5KKREBZPVVDRZT19HX9&s=133-8959284-8300960&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&q="
        url_tail = "&qs=&cf=1&fb=1&sc=1&"
        try:
            keyword = keyword.replace(" ", "%20")
            keyword = keyword.replace("'", "%27")
            url = url_head + keyword + url_tail
            soup = amazon_module.download_soup_by_url(url)
            soup_string = soup.get_text()
            # Strip the JSONP-style wrapper so only the list literal remains.
            soup_string = soup_string[13:-11]
            soup_list = eval(soup_string)
            long_tail_keyword_list = []
            for long_tail_keyword in soup_list[1]:
                print(long_tail_keyword)
                long_tail_keyword_list.append(long_tail_keyword)
            print("")
            return long_tail_keyword_list
        except:
            print("can't find long tail words")
    except:
        print("can't find long tail words")
def keyword_to_long_tail_keyword_list(keyword):
    try:
        # print("keyword:", keyword)
        # print("-" * (len("keyword: ") + len(keyword)))
        keyword = keyword.replace(" ", "%20")
        keyword = keyword.replace("'", "%27")
        if keyword[0] == "*":
            url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=X8QW0QJV6AP2J4TJAZM4&s=140-5560419-0294343&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&ks=8&q=*&qs="
            url_tail = "&cf=1&fb=1&sc=1&"
        else:
            url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=X8QW0QJV6AP2J4TJAZM4&s=140-5560419-0294343&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&ks=8&q="
            url_tail = "&cf=1&fb=1&sc=1&"
        url = url_head + keyword + url_tail
        soup = amazon_module.download_soup_by_url(url)
        soup_string = soup.get_text()
        soup_string = soup_string[13:-11]
        soup_list = eval(soup_string)
        long_tail_keyword_list = []
        for long_tail_keyword in soup_list[1]:
            # print(long_tail_keyword)
            long_tail_keyword_list.append(long_tail_keyword)
        return long_tail_keyword_list
    except:
        print("can't find long tail words")
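# Usage sketch (hypothetical keyword; assumes amazon_module is importable and
# the completion endpoint still returns the JSONP-style payload that the
# slicing/eval above expects):
if __name__ == "__main__":
    example_suggestions = keyword_to_long_tail_keyword_list("travel mug")
    print(example_suggestions)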
def keyword_to_asin_list(self):
    print("brand_to_asin_list is running...")
    brands = open('./brands', 'r')
    brand_list = []
    for brand in brands:
        try:
            base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
            keyword_with_plus = "+".join(brand.split())
            first_page_url = base_url + keyword_with_plus
            get_url_sleep_time = random.randint(0, 5)
            soup = amazon_module.download_soup_by_url(first_page_url)
            # Search the page text directly; encoding it to bytes would break
            # the str-pattern regex below.
            tag_content = soup.get_text()
            if re.search(r"No results for", tag_content):
                self.listing_info_dict["brand"] = brand
                self.listing_info_dict["serch_url"] = first_page_url
                self.listing_info_dict["state"] = 'no results'
            else:
                self.listing_info_dict["brand"] = brand
                self.listing_info_dict["serch_url"] = first_page_url
                self.listing_info_dict["state"] = 'yes'
            try:
                self.listing_info_dict_to_csv_file()
            except:
                pass
            time.sleep(get_url_sleep_time)
        except Exception as e:
            print("{}".format(e))
    brands.close()
def keyword_to_asin_list(self):
    try:
        print("Start running, may take a few minutes")
        base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
        keyword_with_plus = "+".join(self.keyword.split())
        first_page_url = base_url + keyword_with_plus
        soup = amazon_module.download_soup_by_url(first_page_url)
        temp_others_asin_list = []
        try:
            lis = soup.find_all("li", class_="s-result-item")
            for li in lis:
                try:
                    asin = li["data-asin"]
                    temp_others_asin_list.append(asin)
                except:
                    pass
            # remove duplicate asin
            no_repeat_others_asin_list = []
            for asin in temp_others_asin_list:
                if asin not in no_repeat_others_asin_list:
                    no_repeat_others_asin_list.append(asin)
            return no_repeat_others_asin_list
        except:
            pass
    except:
        pass
def asin_to_sponsored_asins(self):
    url = "https://www.amazon.com/dp/" + self.others_asin
    lis = amazon_module.download_soup_by_url(url).find(
        id="sp_detail").find("ol").find_all("li")
    sponsored_asin_list = []
    for li in lis:
        sponsored_asin = li.find("div")["data-asin"]
        sponsored_asin_list.append(sponsored_asin)
    return sponsored_asin_list
def asin_to_size_weight(asin):
    # asin = "B075YV1BT8"
    url_head = "https://sellercentral.amazon.com/fba/profitabilitycalculator/productmatches?searchKey="
    url_tail = "&language=en_US&profitcalcToken=p9FcdMuSse7SGDBTzP9EgOn9nuQj3D"
    url = url_head + asin + url_tail
    soup = amazon_module.download_soup_by_url(url)
    soup_text = soup.get_text()
    soup_dict = json.loads(soup_text)
    item_info_dict = soup_dict['data'][0]
    # for k, v in item_info_dict.items():
    #     print(k, ":", v)
    return item_info_dict
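# Usage sketch (ASIN taken from the commented-out example above; assumes the
# profitcalcToken baked into the URL is still valid for the current session):
if __name__ == "__main__":
    example_item_info = asin_to_size_weight("B075YV1BT8")
    for key, value in example_item_info.items():
        print(key, ":", value)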
def storefront_url_to_store_url_list(self):
    store_url_list = []
    store_url_list.append(self.storefront_url)
    page = self.max_page
    if page == 1:
        return store_url_list
    while page > 1:
        soup = amazon_module.download_soup_by_url(store_url_list[-1])
        try:
            if soup.find(id="pagnNextLink")['href']:
                next_page_url_part2 = soup.find(id="pagnNextLink")['href']
                next_page_url = "https://www.amazon.com" + next_page_url_part2
                store_url_list.append(next_page_url)
        except:
            return store_url_list
        page = page - 1
    return store_url_list
def page_url_to_asin_list(url):
    asin_list = []
    try:
        page_soup = amazon_module.download_soup_by_url(url)
        page_lis = page_soup.find(id="zg-ordered-list").find_all(
            "li", class_="zg-item-immersion")
        for page_li_index, page_li in enumerate(page_lis):
            try:
                link = page_li.find("a")['href']
                asin = re.findall(r"dp/(.*?)/ref", link)[0]
                badge = page_li.find("span", class_="zg-badge-text").get_text()
                print(page_li_index, asin, badge)
                asin_list.append(asin + badge)
            except:
                pass
        return asin_list
    except:
        pass
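# Usage sketch (best-sellers URL taken from elsewhere in these scripts; assumes
# the page still uses the zg-ordered-list markup parsed above):
if __name__ == "__main__":
    example_asins = page_url_to_asin_list(
        "https://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_unv_e_0_e_1")
    print(example_asins)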
def asin_to_offer_listing(asin, now, country=None):
    print("offer asin: ", asin)
    url = "https://www.amazon.com/gp/offer-listing/" + asin + "/dp_olp_new_mbc?ie=UTF8&condition=new"
    if country:
        url = "https://www.amazon.co.uk/gp/offer-listing/" + asin + "/dp_olp_new_mbc?ie=UTF8&condition=new"
    soup = amazon_module.download_soup_by_url(url)
    offering_list = []
    try:
        if soup.find("div", id="olpOfferList"):
            divs = soup.find("div", id="olpOfferList").find_all(
                "div", class_="a-row a-spacing-mini olpOffer")
            num = 0
            for div in divs:
                store_name = "Amazon"
                store_url = "https://www.amazon.com"
                store_price = ""
                span_name = div.find("span", class_="a-size-medium a-text-bold")
                if span_name:
                    store_name = span_name.get_text().strip()
                    store_url = store_url + span_name.find("a")['href']
                span_price = div.find(
                    "span",
                    class_="a-size-large a-color-price olpOfferPrice a-text-bold")
                if span_price:
                    store_price = span_price.get_text().strip()
                    store_price = re.search(r'(\d*\.\d*)', store_price)
                    store_price = store_price.group()
                num = num + 1
                offer_dict = {
                    "detail_id": now,
                    "offer_num": num,
                    "offer_asin": asin,
                    "offer_name": store_name,
                    "offer_price": store_price,
                    "offer_url": store_url,
                }
                offering_list.append(offer_dict)
    except Exception as e:
        print("fail to asin_to_offer_listing: {}".format(e))
        pass
    return offering_list
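# Usage sketch (ASIN borrowed from a sample URL elsewhere in these scripts;
# `now` is just a run identifier, an epoch timestamp in the detail scraper):
if __name__ == "__main__":
    import time
    example_offers = asin_to_offer_listing("B00BJLS55G", int(time.time()))
    for offer in example_offers:
        print(offer)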
def keyword_to_amz_rlt(keyword):
    try:
        # print("keyword:", keyword)
        url_head = "https://www.amazon.com/s/ref=nb_sb_noss/147-7192934-0083761?url=search-alias%3Daps&field-keywords="
        url = url_head + keyword
        soup = amazon_module.download_soup_by_url(url)
        results = soup.find(id="s-result-count").get_text()
        results_text = results.split(":")[0]
        # print(results_text)
        m = re.search(r"of (.*?) results", results)
        results = m.group()
        results = results.replace("of ", "").replace(" results", "").replace(",", "")
        # print(results)
        return results
    except:
        print("fail to find results")
def store_url_to_asin_list(self, store_url):
    soup = amazon_module.download_soup_by_url(store_url)
    # lis = soup.find(id="s-results-list-atf").find_all("li")
    lis = soup.find_all("li", class_="celwidget")
    asin_list = []
    for index, li in enumerate(lis):
        self.asin = li["data-asin"]
        self.listing_info_dict = self.asin_to_listing_info()
        self.listing_info_dict_to_csv_file()
        asin_list.append(self.asin)
        # best seller badge
        self.best_seller_badge = ""
        try:
            best_seller_badge_id = "BESTSELLER_" + self.asin
            best_seller_badge = li.find(id=best_seller_badge_id).get_text()
            best_seller_badge = " ".join(best_seller_badge.split())
            self.best_seller_badge = best_seller_badge.replace(
                "Best Seller", "Best Seller ", 1)
            print("best_seller_badge:", self.best_seller_badge)
        except:
            pass
    return asin_list
def keyword_to_mw_rank(keyword):
    try:
        keyword = keyword.replace(" ", "%20")
        url = "https://www.merchantwords.com/search/us/" + keyword + "/sort-highest"
        soup = amazon_module.download_soup_by_url(url)
        trs = soup.find("table").find("tbody").find_all("tr")
        node_list = []
        for tr in trs:
            # print(tr.get_text())
            try:
                blurry_words = tr.find("span").get_text()
                # print(blurry_words)
                num = tr.find_all("td")[1].get_text()
                num = num.replace(",", "")
                # print(num)
                node = tr.find("small")
                node = str(node)
                node = node.replace("<br/>", "; ")
                node = node.replace("<small>", "")
                node = node.replace("</small>", "")
                node = node.replace("&amp;", "&")
                # print(node)
                node_tuple = (blurry_words, num, node)
                node_list.append(node_tuple)
            except:
                pass
        # print(node_list[0][1])
        return node_list[0][1]
    except:
        print("fail to get merchantwords rank!")
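# Usage sketch (hypothetical keyword; assumes the MerchantWords results table
# is reachable without a logged-in session, which may not hold):
if __name__ == "__main__":
    example_mw_volume = keyword_to_mw_rank("travel mug")
    print(example_mw_volume)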
def keyword_to_all_listing_asin_list(self):
    base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
    first_page_url = base_url + self.keyword
    pages_urls_list = []
    pages_urls_list.append(first_page_url)
    page = 1
    has_next_page = True
    while page <= self.max_page and has_next_page:
        soup = amazon_module.download_soup_by_url(pages_urls_list[-1])
        try:
            if soup.find(id="pagnNextLink")["href"]:
                next_page_url_part2 = soup.find(id="pagnNextLink")["href"]
                next_page_url = "https://www.amazon.com" + next_page_url_part2
                pages_urls_list.append(next_page_url)
                page = page + 1
        except:
            # No next-page link: process this page below, then stop paginating
            # instead of looping on the same URL forever.
            has_next_page = False
        try:
            lis = soup.find_all("li", class_="s-result-item")
            for index, li in enumerate(lis):
                try:
                    asin = li["data-asin"]
                    self.asin = asin
                    page_rank = "page" + str(page - 1) + "-" + str(index + 1)
                    print("page_rank: ", page_rank)
                    sponsored_or_natural_rank = "natural_rank"
                    try:
                        if li.find("h5").get_text().strip().split()[0]:
                            if li.find("h5").get_text().strip().split()[0] == "Sponsored":
                                sponsored_or_natural_rank = "sponsored"
                            else:
                                sponsored_or_natural_rank = "natural_rank"
                    except:
                        pass
                    print("sponsored_or_natural_rank: ", sponsored_or_natural_rank)
                    is_prime = ""
                    try:
                        if li.find("i", class_="a-icon-prime"):
                            is_prime = "prime"
                    except:
                        pass
                    print("is_prime: ", is_prime)
                    listing_info_dict = self.asin_to_listing_info()
                    listing_info_dict["page_rank"] = page_rank
                    listing_info_dict["sponsored_or_natural_rank"] = sponsored_or_natural_rank
                    listing_info_dict["is_prime"] = is_prime
                    self.listing_info_dict_list.append(listing_info_dict)
                    try:
                        self.picture_url = listing_info_dict['picture_url']
                        self.download_picture_by_url()
                    except:
                        pass
                except:
                    pass
        except:
            pass
def __init__(self, asin):
    self.soup = amazon_module.download_soup_by_url(
        "https://www.amazon.com/dp/" + asin)
def asin_to_listing_info(asin): print("asin: ", asin) url = "https://www.amazon.com/dp/" + asin soup = amazon_module.download_soup_by_url(url) print(len(soup)) brand = " " try: if soup.find(id="bylineInfo"): brand = soup.find(id="bylineInfo").get_text().strip() if soup.find(id="brand"): brand = soup.find(id="brand").get_text().strip() except: pass print("brand:", brand) badge = "" try: if soup.find("a", class_="badge-link"): badge = " ".join(soup.find("a", class_="badge-link").get_text().strip().split()) except: pass print("badge:", badge) title = "" try: if soup.find(id="productTitle"): title = soup.find(id="productTitle").get_text().strip() except: pass print("title:", title) variation_name = " " try: if soup.find(id="variation_pattern_name"): variation_name = soup.find(id="variation_pattern_name").find("span").get_text().strip() print("variation_pattern_name: ", variation_name) elif soup.find(id="variation_color_name"): variation_name = soup.find(id="variation_color_name").find("span").get_text().strip() print("variation_color_name: ", variation_name) elif soup.find(id="variation_size_name"): variation_name = soup.find(id="variation_size_name").find("span").get_text().strip() print("variation_size_name: ", variation_name) else: print("variation_name: ", variation_name) except: pass price = " " try: if soup.find(id="price"): price = soup.find(id="price").find("span").get_text().strip() if soup.find(id="priceblock_ourprice"): price = soup.find(id="priceblock_ourprice").get_text().strip() except: pass print("price:", price) sold_by = " " try: if soup.find(id="merchant-info"): # print("soup.find(id='merchant-info').get_text().strip(): ", soup.find(id="merchant-info").get_text().strip()) sold_by = " ".join(soup.find(id="merchant-info").get_text().strip().split()) except: pass print("sold_by:", sold_by) how_many_sellers = " " try: if soup.find(id="olp_feature_div"): how_many_sellers = soup.find(id="olp_feature_div").find("a").get_text().strip() except: pass print("how_many_sellers:", how_many_sellers ) bullets_list = [] try: if soup.find("div", id="feature-bullets"): bullets_contents = soup.find("div", id="feature-bullets").find_all("span", class_="a-list-item") print("bullets:") for bullets_content in bullets_contents: print(bullets_content.get_text().strip()) #toys if bullets_content.span: continue bullets_list.append(bullets_content.get_text().strip()) except: pass description = " " try: if soup.find(id="productDescription"): description = soup.find(id="productDescription").get_text() if soup.find(id="aplus"): description = soup.find(id="aplus").get_text() description = " ".join(description.split()) except: pass print("description:", description) salesrank = " " try: if soup.find(id="SalesRank"): salesrank = soup.find(id="SalesRank") salesrank = salesrank.get_text().strip() salesrank = re.search('#(\d|,)+', salesrank) salesrank = salesrank.group() salesrank = salesrank.replace(',', '') salesrank = salesrank.replace('#', '') #toys if soup.find(id="productDetails_detailBullets_sections1"): trs = soup.find(id="productDetails_detailBullets_sections1").find_all("tr") for tr in trs: if tr.find("th").get_text().strip(): if tr.find("th").get_text().strip() == "Best Sellers Rank": salesrank = tr.find("td").get_text().strip() salesrank = re.search('#(\d|,)+', salesrank) salesrank = salesrank.group() salesrank = salesrank.replace(',', '') salesrank = salesrank.replace('#', '') except: pass print("salesrank:", salesrank) review_num = " " try: if soup.find(id="acrCustomerReviewText"): review_num = 
soup.find(id="acrCustomerReviewText").get_text().split()[0].strip() except: pass print("review_num:", review_num) review_value = " " try: if soup.find(class_="arp-rating-out-of-text"): review_value = soup.find(class_="arp-rating-out-of-text").get_text().strip() review_value = re.search('(.*?)\s', review_value) review_value = review_value.group() review_value = review_value.strip() except: pass print("review_value:", review_value) qa_num = " " try: if soup.find(id="askATFLink"): qa_num = soup.find(id="askATFLink").get_text().split()[0].strip() except: pass print("qa_num:", qa_num) picture_url = " " try: picture_urls_dict = dict() if soup.find("img", id="landingImage"): picture_urls = soup.find("img", id="landingImage")["data-a-dynamic-image"] picture_urls_dict = eval(picture_urls) picture_urls_list = [] for key in picture_urls_dict.keys(): picture_urls_list.append(key) picture_url = picture_urls_list[0] except: pass print("picture_url:", picture_url) listing_info_dict = { "asin": asin, "url": url, "brand": brand, "badge": badge, "title": title, "variation_name": variation_name, "price": price, "sold_by": sold_by, "how_many_sellers": how_many_sellers, "bullets": bullets_list, "description": description, "salesrank": salesrank, "review_num": review_num, "review_value": review_value, "qa_num": qa_num, "picture_url": picture_url } return listing_info_dict
def keyword_to_asin_list(keyword, max_page, table_name, conn):
    print("keyword_to_asin_list is running...")
    base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
    keyword_with_underline = "_".join(keyword.split())
    keyword_with_plus = "+".join(keyword.split())
    first_page_url = base_url + keyword_with_plus
    pages_urls_list = []
    pages_urls_list.append(first_page_url)
    page = 1
    has_next_page = True
    while page <= max_page and has_next_page:
        # soup = self.download_soup_by_url(pages_urls_list[-1])
        soup = amazon_module.download_soup_by_url(pages_urls_list[-1])
        try:
            if soup.find(id="pagnNextLink")["href"]:
                next_page_url_part2 = soup.find(id="pagnNextLink")["href"]
                next_page_url = "https://www.amazon.com" + next_page_url_part2
                pages_urls_list.append(next_page_url)
                page = page + 1
        except:
            # No next-page link: process this page below, then stop paginating
            # instead of looping on the same URL forever.
            has_next_page = False
        try:
            lis = soup.find_all("li", class_="s-result-item")
            for index, li in enumerate(lis):
                try:
                    asin = li["data-asin"]
                    page_rank = "page" + str(page - 1) + "-" + str(index + 1)
                    print("page_rank: ", page_rank)
                    sponsored_or_natural_rank = "natural_rank"
                    try:
                        if li.find("h5").get_text().strip().split()[0]:
                            if li.find("h5").get_text().strip().split()[0] == "Sponsored":
                                sponsored_or_natural_rank = "sponsored"
                            else:
                                sponsored_or_natural_rank = "natural_rank"
                    except:
                        pass
                    print("sponsored_or_natural_rank: ", sponsored_or_natural_rank)
                    is_prime = ""
                    try:
                        if li.find("i", class_="a-icon-prime"):
                            is_prime = "prime"
                    except:
                        pass
                    print("is_prime: ", is_prime)
                    listing_info_dict = asin_to_listing_info(asin)
                    listing_info_dict["page_rank"] = page_rank
                    listing_info_dict["sponsored_or_natural_rank"] = sponsored_or_natural_rank
                    listing_info_dict["is_prime"] = is_prime
                    try:
                        insert_data_to_mysql(listing_info_dict, table_name, conn)
                    except:
                        pass
                    try:
                        picture_url = listing_info_dict['picture_url']
                        picture_folder = keyword_with_underline
                        download_picture_by_url(picture_url, picture_folder, asin)
                    except:
                        print("fail to download picture")
                except:
                    pass
        except:
            pass
def first_review_url_to_review_info(self, url, asin):
    location = re.search("ref=", url)
    span = location.span()[0]
    first_review_url_part1 = url[:span]
    review_base_url = (first_review_url_part1
                       + "ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&filterByStar="
                       + self.all_or_positive_or_critical
                       + "&reviewerType=all_reviews&sortBy="
                       + self.top_or_recent
                       + "&pageNumber=")
    first_review_url = review_base_url + str(1)
    first_review_url_soup = amazon_module.download_soup_by_url(first_review_url)
    last_page = 1
    try:
        last_page = first_review_url_soup.find(id="cm_cr-pagination_bar").find_all(
            "li", class_="page-button")[-1].get_text()
    except:
        pass
    last_page = int(last_page)
    min_page = min(last_page, self.max_page)
    for page in range(1, min_page + 1):
        review_url = review_base_url + str(page)
        try:
            soup = amazon_module.download_soup_by_url(review_url)
            review_list = soup.find(id="cm_cr-review_list").find_all("div", {"data-hook": "review"})
            self.review_dict_list = []
            for review_index, review in enumerate(review_list):
                review_title = review.find("a", {"data-hook": "review-title"}).get_text()
                review_star_rating = review.find("i", {"data-hook": "review-star-rating"}).get_text()
                review_author = review.find("a", {"data-hook": "review-author"}).get_text()
                review_date = review.find("span", {"data-hook": "review-date"}).get_text()
                review_body = review.find("span", {"data-hook": "review-body"}).get_text()
                page_rank = "page" + str(page) + "-" + str(review_index + 1)
                profile_url_part = review.find("a", {"data-hook": "review-author"})['href']
                profile_url = "https://www.amazon.com" + profile_url_part
                review_badge = ""
                try:
                    review_badge = review.find("span", {"data-hook": "avp-badge"}).get_text()
                except:
                    pass
                review_variation = ""
                try:
                    review_variation = review.find("a", {"data-hook": "format-strip"}).get_text()
                except:
                    pass
                review_dict = {
                    "page_rank": page_rank,
                    "asin": asin,
                    "review_badge": review_badge,
                    "review_variation": review_variation,
                    "review_title": review_title,
                    "review_star_rating": review_star_rating,
                    "review_author": review_author,
                    "review_date": review_date,
                    "review_body": review_body,
                    "profile_url": profile_url,
                }
                print(review_dict)
                self.review_dict_list.append(review_dict)
            self.dict_list_to_csv_file()
        except:
            pass
def asin_to_listing_info(self):
    print("asin: ", self.asin)
    url = "https://www.amazon.com/dp/" + self.asin
    soup = amazon_module.download_soup_by_url(url)
    brand = " "
    try:
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except:
        pass
    print("brand:", brand)
    badge = " "
    try:
        if soup.find(id="acBadge_feature_div").find("div", class_="ac-badge-wrapper"):
            badge = " ".join(
                soup.find(id="acBadge_feature_div").find(
                    "div", class_="ac-badge-wrapper").get_text().strip().split())
            badge = badge.replace(
                "Amazon's Choice recommends highly rated, well-priced products available to ship immediately. ",
                "", 1)
    except:
        pass
    print("badge:", badge)
    title = " "
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except:
        pass
    print("title:", title)
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(id="variation_pattern_name").find("span").get_text().strip()
            print("variation_pattern_name: ", variation_name)
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(id="variation_color_name").find("span").get_text().strip()
            print("variation_color_name: ", variation_name)
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(id="variation_size_name").find("span").get_text().strip()
            print("variation_size_name: ", variation_name)
        else:
            print("variation_name: ", variation_name)
        variation_name = " ".join(variation_name.split())
    except:
        pass
    price = " "
    try:
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text().strip()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text().strip()
    except:
        pass
    print("price:", price)
    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            sold_by = " ".join(soup.find(id="merchant-info").get_text().strip().split())
    except:
        pass
    print("sold_by:", sold_by)
    how_many_sellers = " "
    try:
        if soup.find(id="olp_feature_div"):
            how_many_sellers = soup.find(id="olp_feature_div").find("a").get_text().strip()
    except:
        pass
    print("how_many_sellers:", how_many_sellers)
    bullets_list = []
    # Placeholders so the dict below always has ten bullet fields, even when
    # the feature-bullets block is missing or cannot be parsed.
    bullet_1 = bullet_2 = bullet_3 = bullet_4 = bullet_5 = " "
    bullet_6 = bullet_7 = bullet_8 = bullet_9 = bullet_10 = " "
    try:
        if soup.find("div", id="feature-bullets"):
            bullets_contents = soup.find("div", id="feature-bullets").find_all("span", class_="a-list-item")
            for bullets_content in bullets_contents:
                print(bullets_content.get_text().strip())
                # toys
                if bullets_content.span:
                    continue
                bullets_list.append(bullets_content.get_text().strip())
            # Pad to ten entries, then unpack into the individual bullet fields.
            padded_bullets = bullets_list + [" "] * 10
            (bullet_1, bullet_2, bullet_3, bullet_4, bullet_5,
             bullet_6, bullet_7, bullet_8, bullet_9, bullet_10) = padded_bullets[:10]
    except:
        pass
    print("bullets_list:", bullets_list)
    a_plus_page = " "
    try:
        if soup.find(id="aplus"):
            a_plus_page = soup.find(id="aplus").get_text()
            a_plus_page = " ".join(a_plus_page.split())
    except:
        pass
    a_plus_page = re.sub(r"(Product Description.*; } )", "", a_plus_page)
    a_plus_page = re.sub(r"(From the manufacturer.*; } )", "", a_plus_page)
    a_plus_page = a_plus_page.replace("View larger ", "")
    a_plus_page = a_plus_page.replace("Read more ", "")
    print("a_plus_page:", a_plus_page)
    description = " "
    try:
        if soup.find(id="productDescription"):
            description = soup.find(id="productDescription").get_text()
            description = " ".join(description.split())
    except:
        pass
    description = re.sub(r"(Product Description.*; } )", "", description)
    description = re.sub(r"(From the manufacturer.*; } )", "", description)
    description = description.replace("View larger ", "")
    description = description.replace("Read more ", "")
    print("description:", description)
    salesrank = " "
    try:
        if soup.find(id="SalesRank"):
            salesrank = soup.find(id="SalesRank")
            salesrank = salesrank.get_text().strip()
            salesrank = re.search(r'#(\d|,)+', salesrank)
            salesrank = salesrank.group()
            salesrank = salesrank.replace(',', '')
            salesrank = salesrank.replace('#', '')
        # toys
        if soup.find(id="productDetails_detailBullets_sections1"):
            trs = soup.find(id="productDetails_detailBullets_sections1").find_all("tr")
            for tr in trs:
                if tr.find("th").get_text().strip():
                    if tr.find("th").get_text().strip() == "Best Sellers Rank":
                        salesrank = tr.find("td").get_text().strip()
                        salesrank = re.search(r'#(\d|,)+', salesrank)
                        salesrank = salesrank.group()
                        salesrank = salesrank.replace(',', '')
                        salesrank = salesrank.replace('#', '')
    except:
        pass
    print("salesrank:", salesrank)
    review_num = " "
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(id="acrCustomerReviewText").get_text().split()[0].strip()
    except:
        pass
    print("review_num:", review_num)
    review_value = " "
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(class_="arp-rating-out-of-text").get_text().strip()
            review_value = re.search(r'(.*?)\s', review_value)
            review_value = review_value.group()
            review_value = review_value.strip()
    except:
        pass
    print("review_value:", review_value)
    qa_num = " "
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except:
        pass
    print("qa_num:", qa_num)
    picture_url = " "
    try:
        picture_urls_dict = dict()
        if soup.find("img", id="landingImage"):
            picture_urls = soup.find("img", id="landingImage")["data-a-dynamic-image"]
            picture_urls_dict = eval(picture_urls)
            picture_urls_list = []
            for key in picture_urls_dict.keys():
                picture_urls_list.append(key)
            picture_url = picture_urls_list[0]
    except:
        pass
    print("picture_url:", picture_url)
    self.picture_url = picture_url
    self.download_picture_by_url()
    self.listing_info_dict = {
        "asin": self.asin,
        "url": url,
        "brand": brand,
        "best_seller_badge": self.best_seller_badge,
        "badge": badge,
        "title": title,
        "variation_name": variation_name,
        "price": price,
        "sold_by": sold_by,
        "how_many_sellers": how_many_sellers,
        "bullet_1": bullet_1,
        "bullet_2": bullet_2,
        "bullet_3": bullet_3,
        "bullet_4": bullet_4,
        "bullet_5": bullet_5,
        "bullet_6": bullet_6,
        "bullet_7": bullet_7,
        "bullet_8": bullet_8,
        "bullet_9": bullet_9,
        "bullet_10": bullet_10,
        "a_plus_page": a_plus_page,
        "description": description,
        "salesrank": salesrank,
        "review_num": review_num,
        "review_value": review_value,
        "qa_num": qa_num,
        "picture_url": picture_url
    }
    return self.listing_info_dict
        with open(img_path, 'wb') as fp:
            fp.write(pic.content)
        print("SUCCESS to download picture")
    except requests.exceptions.ConnectionError:
        print("FAIL to download picture!")


# main
start_datetime = datetime.now()
print("start_datetime:", start_datetime)
csv_file_name = str(start_datetime).replace(":", ";").strip().split(".")[0] + ".csv"

for asin in asin_list:
    listing_url = "https://www.amazon.com/dp/" + asin
    soup = amazon_module.download_soup_by_url(listing_url)
    start_review_url_part2 = soup.find(id="dp-summary-see-all-reviews")["href"]
    start_review_url = "https://www.amazon.com" + start_review_url_part2
    location = re.search("ref=", start_review_url)
    span = location.span()[0]
    start_review_url_part1 = start_review_url[:span]
    # https://www.amazon.com/Stainless-Steel-Personalized-Tags-Lines/product-reviews/B00BJLS55G/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&pageNumber=1&sortBy=helpful&filterByStar=five_star&mediaType=media_reviews_only
    review_base_url = (start_review_url_part1
                       + "ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=" + all_or_VP
                       + "&sortBy=" + top_or_recent
                       + "&filterByStar=" + stars
                       + "&mediaType=" + all_or_media
                       + "&pageNumber=")
    first_review_url = review_base_url + str(1)
    for page in range(1, max_page + 1):
        review_url = review_base_url + str(page)
        try:
            dict_list = []
            soup = amazon_module.download_soup_by_url(review_url)
from bs4 import BeautifulSoup
from amazon_module import amazon_module
import re

# proxy
# SOCKS5 proxy for HTTP/HTTPS
proxies = {
    'http': 'http://192.168.1.103:48850',
    'https': 'https://192.168.1.103:48850',
}

# headers
headers = {}

url = 'https://www.amazon.com/dp/B07B8CMX26'
soup = amazon_module.download_soup_by_url(url)
# results = open('./product-detail-page-one.html', 'r')
# r = results.read()
# soup = BeautifulSoup(r, 'html.parser')

try:
    # Variations
    try:
        lis = soup.find("ul", role="radiogroup").find_all("li")
        variation_list = []
        for li in lis:
            variation_list.append(li['data-defaultasin'])
    except Exception as e:
        print("Analyze Variations Failed!: {}".format(e))
        pass
def asin_to_listing_info(asin, index):
    print("asin: ", asin)
    url = "https://www.amazon.com/dp/" + asin
    print("url: ", url)
    soup = amazon_module.download_soup_by_url(url)
    brand = " "
    try:
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except:
        pass
    print("brand: ", brand)
    badge = " "
    try:
        if soup.find("a", class_="badge-link"):
            badge = " ".join(soup.find("a", class_="badge-link").get_text().strip().split())
    except:
        pass
    print("badge: ", badge)
    title = " "
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except:
        pass
    print("title: ", title)
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(id="variation_pattern_name").find("span").get_text().strip()
            print("variation_pattern_name: ", variation_name)
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(id="variation_color_name").find("span").get_text().strip()
            print("variation_color_name: ", variation_name)
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(id="variation_size_name").find("span").get_text().strip()
            print("variation_size_name: ", variation_name)
        else:
            print("variation_name: ", variation_name)
    except:
        pass
    price = " "
    sale_price = " "
    try:
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text().strip()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text().strip()
        if soup.find(id="priceblock_saleprice"):
            sale_price = soup.find(id="priceblock_saleprice").get_text().strip()
    except:
        pass
    print("price: ", price)
    print("sale_price: ", sale_price)
    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            # print("soup.find(id='merchant-info').get_text().strip(): ", soup.find(id="merchant-info").get_text().strip())
            sold_by = " ".join(soup.find(id="merchant-info").get_text().strip().split())
    except:
        pass
    print("sold_by: ", sold_by)
    how_many_sellers = " "
    try:
        if soup.find(id="olp_feature_div"):
            how_many_sellers = soup.find(id="olp_feature_div").find("a").get_text().strip()
    except:
        pass
    print("how_many_sellers: ", how_many_sellers)
    bullets_list = []
    bullet_1 = " "
    bullet_2 = " "
    bullet_3 = " "
    bullet_4 = " "
    bullet_5 = " "
    try:
        if soup.find("div", id="feature-bullets"):
            bullets_contents = soup.find("div", id="feature-bullets").find_all("span", class_="a-list-item")
            # print("bullets: ")
            for bullets_content in bullets_contents:
                # print(bullets_content.get_text().strip())
                # toys
                if bullets_content.span:
                    continue
                bullets_list.append(bullets_content.get_text().strip())
    except:
        pass
    try:
        bullet_1 = bullets_list[0]
        bullet_2 = bullets_list[1]
        bullet_3 = bullets_list[2]
        bullet_4 = bullets_list[3]
        bullet_5 = bullets_list[4]
    except:
        pass
    print("bullet_1: ", bullet_1)
    print("bullet_2: ", bullet_2)
    print("bullet_3: ", bullet_3)
    print("bullet_4: ", bullet_4)
    print("bullet_5: ", bullet_5)
    description = " "
    try:
        if soup.find(id="productDescription"):
            description = soup.find(id="productDescription").get_text()
        if soup.find(id="aplus"):
            description = soup.find(id="aplus").find("div").find_all("div").get_text()
            description = re.search(r".aplus-v2(.*)\}(.*)", description)
            description = description.group(1)
            description = description.strip()
            description = " ".join(description.split())
    except:
        pass
    print("description: ", description)
    salesrank = " "
    # try:
    #     if soup.find(id="SalesRank"):
    #         salesrank = soup.find(id="SalesRank")
    #         salesrank = salesrank.get_text().strip()
    #         salesrank = re.search('#(\d|,)+', salesrank)
    #         salesrank = salesrank.group()
    #         salesrank = salesrank.replace(',', '')
    #         salesrank = salesrank.replace('#', '')
    #     # toys
    #     if soup.find(id="productDetails_detailBullets_sections1"):
    #         trs = soup.find(id="productDetails_detailBullets_sections1").find_all("tr")
    #         for tr in trs:
    #             if tr.find("th").get_text().strip():
    #                 if tr.find("th").get_text().strip() == "Best Sellers Rank":
    #                     salesrank = tr.find("td").get_text().strip()
    #                     salesrank = re.search('#(\d|,)+', salesrank)
    #                     salesrank = salesrank.group()
    #                     salesrank = salesrank.replace(',', '')
    #                     salesrank = salesrank.replace('#', '')
    # except:
    #     pass
    # print("salesrank: ", salesrank)

    # Top-level rank plus up to two sub-category ranks and their node names.
    salesrank_1 = " "
    salesrank_2 = " "
    salesrank_3 = " "
    salesrank_node_1 = " "
    salesrank_node_2 = " "
    salesrank_node_3 = " "
    try:
        salesrank_1 = soup.find(id="SalesRank")
        salesrank_1 = salesrank_1.get_text().strip()
        salesrank_1 = re.search(r'#(\d|,)+', salesrank_1)
        salesrank_1 = salesrank_1.group()
        salesrank_1 = salesrank_1.replace(',', '')
        salesrank_1 = salesrank_1.replace('#', '')
        # print(salesrank_1)
        salesrank_node_1 = soup.find(id="SalesRank")
        salesrank_node_1 = salesrank_node_1.get_text().strip()
        salesrank_node_1 = re.search(r"in(.*?)\(", salesrank_node_1)
        salesrank_node_1 = salesrank_node_1.group()
        salesrank_node_1 = salesrank_node_1.replace("in ", "")
        salesrank_node_1 = salesrank_node_1.replace(" (", "")
        salesrank_node_1 = salesrank_node_1.strip()
        # print(salesrank_node_1)
        try:
            lis = soup.find(id="SalesRank").find("ul", class_="zg_hrsr").find_all("li")
            node_salesrank_list = []
            node_name_list = []
            for li in lis:
                node_salesrank = li.get_text().strip()
                node_salesrank = re.search(r'#(\d|,)+', node_salesrank)
                node_salesrank = node_salesrank.group()
                node_salesrank = node_salesrank.replace(',', '')
                node_salesrank = node_salesrank.replace('#', '')
                node_salesrank_list.append(node_salesrank)
                node_name = li.get_text().strip()
                node_name = re.search(r"in(.*)", node_name)
                node_name = node_name.group()
                node_name = node_name.replace("in\xa0", "")
                node_name = node_name.strip()
                node_name_list.append(node_name)
            # print(node_salesrank_list)
            # print(node_name_list)
            if len(node_salesrank_list) == 1:
                salesrank_2 = node_salesrank_list[0]
            if len(node_salesrank_list) == 2:
                salesrank_2 = node_salesrank_list[0]
                salesrank_3 = node_salesrank_list[1]
            if len(node_salesrank_list) == 1:
                salesrank_node_2 = node_name_list[0]
            if len(node_salesrank_list) == 2:
                salesrank_node_2 = node_name_list[0]
                salesrank_node_3 = node_name_list[1]
        except:
            pass
    except:
        pass
    print("salesrank_1: ", salesrank_1, " ", "salesrank_node_1: ", salesrank_node_1)
    print("salesrank_2: ", salesrank_2, " ", "salesrank_node_2: ", salesrank_node_2)
    print("salesrank_3: ", salesrank_3, " ", "salesrank_node_3: ", salesrank_node_3)
    review_num = " "
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(id="acrCustomerReviewText").get_text().split()[0].strip()
    except:
        pass
    print("review_num: ", review_num)
    review_value = " "
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(class_="arp-rating-out-of-text").get_text().strip()
            review_value = re.search(r'(.*?)\s', review_value)
            review_value = review_value.group()
            review_value = review_value.strip()
    except:
        pass
    print("review_value: ", review_value)
    qa_num = " "
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except:
        pass
    print("qa_num: ", qa_num)
    picture_url = " "
    try:
        picture_urls_dict = dict()
        if soup.find("img", id="landingImage"):
            picture_urls = soup.find("img", id="landingImage")["data-a-dynamic-image"]
            picture_urls_dict = eval(picture_urls)
            picture_urls_list = []
            for key in picture_urls_dict.keys():
                picture_urls_list.append(key)
            picture_url = picture_urls_list[0]
    except:
        pass
    print("picture_url: ", picture_url)
    listing_info_dict = {
        "asin": asin,
        "url": url,
        "brand": brand,
        "badge": badge,
        "title": title,
        "variation_name": variation_name,
        "price": price,
        "sale_price": sale_price,
        "sold_by": sold_by,
        "how_many_sellers": how_many_sellers,
        # "bullets": bullets_list,
        "bullet_1": bullet_1,
        "bullet_2": bullet_2,
        "bullet_3": bullet_3,
        "bullet_4": bullet_4,
        "bullet_5": bullet_5,
        "description": description,
        # "salesrank": salesrank,
        "salesrank_1": salesrank_1,
        "salesrank_node_1": salesrank_node_1,
        "salesrank_2": salesrank_2,
        "salesrank_node_2": salesrank_node_2,
        "salesrank_3": salesrank_3,
        "salesrank_node_3": salesrank_node_3,
        "review_num": review_num,
        "review_value": review_value,
        "qa_num": qa_num,
        "picture_url": picture_url
    }
    # return listing_info_dict
    dict_list_to_csv_file(listing_info_dict, index)
    try:
        download_picture_by_url(asin, picture_url)
    except:
        pass
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from amazon_module import amazon_module
import re
import os
from bs4 import BeautifulSoup
import requests
import csv
import openpyxl

node_dict = {}
node_url_dict = {}

top100_url = "https://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_unv_e_0_e_1"
soup = amazon_module.download_soup_by_url(top100_url)
lis = soup.find(id="zg_browseRoot").find("ul").find_all("li")
# for temp_li_index, temp_li in enumerate(lis):
#     print(temp_li_index, temp_li.get_text())
# 0 Amazon Devices & Accessories
# 1 Amazon Launchpad
# 2 Appliances
# 3 Apps & Games
# 4 Arts, Crafts & Sewing
# 5 Automotive
# 6 Baby
# 7 Beauty & Personal Care
# 8 Books
# 9 CDs & Vinyl
def asin_to_listing_info(asin, country=None):
    now = int(time.time())
    # print("asin: ", asin)
    url = "https://www.amazon.com/dp/" + asin
    if country:
        url = "https://www.amazon.co.uk/dp/" + asin
    soup = amazon_module.download_soup_by_url(url)
    brand = " "
    brand_url = ""
    try:
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
            brand_url = soup.find(id="bylineInfo")["href"]
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except:
        pass
    title = ""
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except:
        pass
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(id="variation_pattern_name").find("span").get_text().strip()
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(id="variation_color_name").find("span").get_text().strip()
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(id="variation_size_name").find("span").get_text().strip()
    except:
        pass
    price = 0.0
    try:
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text()
            price = re.search(r'(\d*\.\d*)', price)
            price = price.group()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text()
            price = re.search(r'(\d*\.\d*)', price)
            price = price.group()
    except:
        pass
    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            sold_by = " ".join(soup.find(id="merchant-info").get_text().strip().split())
    except:
        pass
    availability = ""
    try:
        if soup.find(id="availability"):
            availability = soup.find(id="availability").find("span").get_text().strip()
    except:
        pass
    aplus = ""
    try:
        if soup.find(id="aplus"):
            aplus = soup.find(id="aplus").find("h2").get_text().strip()
    except:
        pass
    ranking_list = []
    offering_list = []
    spans_text = ""
    review_dict_list = []
    review_last_desc = ""
    review_last_time = 0
    review_last_unit = ""
    # Salesrank
    try:
        trs = soup.find(id="productDetails_detailBullets_sections1")
        if trs:
            trs = trs.find_all("tr")
            for tr in trs:
                try:
                    th = tr.find("th").get_text().strip()
                    if th == "Best Sellers Rank":
                        spans = tr.find("span").find_all("span")
                        num = 0
                        for span in spans:
                            try:
                                span_text = span.get_text()
                                spans_text = spans_text + span_text + "\n"
                                ranking = re.search(r'#(\d|,)+', span_text)
                                ranking = ranking.group()
                                ranking = ranking.replace(',', '')
                                ranking = ranking.replace('#', '')
                                rank_text_arr = span_text.split(' in ')
                                rank_text = rank_text_arr[1]
                                rank_text_arr = rank_text.split('(')
                                rank_text = rank_text_arr[0]
                                num = num + 1
                                rank_dict = {
                                    "detail_id": now,
                                    "rank_num": num,
                                    "rank_asin": asin,
                                    "rank_order": ranking,
                                    "rank_text": rank_text,
                                }
                                ranking_list.append(rank_dict)
                            except Exception as e:
                                print("Handling Salesrank string errors!: {}".format(e))
                                pass
                except Exception as e:
                    print("Analyze Salesrank th errors!: {}".format(e))
                    pass
    except Exception as e:
        print("Analyze Salesrank errors!: {}".format(e))
        pass
    review_num = 0
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(id="acrCustomerReviewText").get_text().split()[0].strip(",").strip().replace(',', '')
    except:
        pass
    review_value = 0.0
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(class_="arp-rating-out-of-text").get_text().strip()
            review_value = re.search(r'(.*?)\s', review_value)
            review_value = review_value.group()
            review_value = review_value.strip()
    except:
        pass
    qa_num = 0
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except:
        pass
    try:
        review_list = soup.find(id="most-recent-reviews-content")
        if review_list and review_list.find_all("div", {"data-hook": "recent-review"}):
            review_list = review_list.find_all("div", {"data-hook": "recent-review"})
            for review_index, review in enumerate(review_list):
                review_title = review.find("span", {"data-hook": "review-title-recent"}).get_text()
                review_star_rating = review.find("i", {"data-hook": "review-star-rating-recent"}).get_text()
                review_author_url = review.find("a", {"class": "a-profile"})["href"]
                review_author = review.find("span", {"class": "a-profile-name"}).get_text()
                review_date_desc = review.find("span", {"data-hook": "review-author-timestamp"}).get_text()
                review_body = review.find("span", {"data-hook": "review-body-recent"}).get_text()
                review_date_desc_temp = review_date_desc.lstrip('Published ').rstrip(' ago')
                if 'on ' in review_date_desc_temp:
                    review_date = '2'
                    review_date_unit = 'year'
                else:
                    review_date_desc_arr = review_date_desc_temp.split(' ')
                    review_date = review_date_desc_arr[0]
                    review_date_unit = review_date_desc_arr[1].rstrip('s')
                # TODO:
                if review_index == 0:
                    review_last_desc = review_date_desc
                    review_last_time = review_date
                    review_last_unit = review_date_unit
                review_dict = {
                    "review_asin": asin,
                    "review_title": review_title,
                    "review_star": review_star_rating.rstrip(" out of 5 stars"),
                    "review_author": review_author,
                    "review_author_url": review_author_url,
                    "review_date": review_date,
                    "review_date_unit": review_date_unit,
                    "review_date_desc": review_date_desc,
                    "review_body": review_body,
                }
                review_dict_list.append(review_dict)
    except Exception as e:
        print("analyze review errors:{}".format(e))
        pass
    # follow_sell
    how_many_sellers = ""
    follow_type = ""
    follow_num = 0
    buy_money = 0.0
    try:
        olp_feature_div = soup.find(id="olp_feature_div")
        if olp_feature_div and olp_feature_div.find("a"):
            how_many_sellers = olp_feature_div.find("a").get_text().strip()
            if country:
                follow_sell = how_many_sellers.split()
                follow_num = follow_sell[0].strip()
                follow_type = follow_sell[1].strip()
            else:
                follow_sell = how_many_sellers.split('(')
                follow_type = follow_sell[0].strip()
                follow_num = follow_sell[1].split(')')
                buy_money = follow_num[1].split('$')
                buy_money = buy_money[1].strip().replace(',', '').rstrip(' +')
                follow_num = follow_num[0].strip()
    except Exception as e:
        print("Handling follow_sell errors !: {}".format(e))
        pass
    listing_info_dict = {
        "id": now,
        "asin": asin,
        "url": url,
        "brand": brand,
        "brand_url": brand_url,
        "title": title,
        "variation_name": variation_name,
        "availability": availability,
        "price": price,
        "sold_by": sold_by,
        "how_many_sellers": how_many_sellers,
        "follow_type": follow_type,
        "follow_num": follow_num,
        "buy_money": buy_money,
        "review_num": review_num,
        "review_value": review_value,
        "review_last_time": review_last_time,
        "review_last_unit": review_last_unit,
        "review_last_desc": review_last_desc,
        "spans_text": spans_text,
        "qa_num": qa_num,
        "aplus": aplus,
    }
    return listing_info_dict, ranking_list, offering_list, review_dict_list