Beispiel #1
0
 def asin_to_first_review_url(self, asin):
     """Return the URL of the "see all reviews" page for *asin*.

     Fetches the product detail page and follows the review-summary
     link found there.
     """
     detail_page_url = "https://www.amazon.com/dp/" + asin
     soup = amazon_module.download_soup_by_url(detail_page_url)
     reviews_href = soup.find(id="dp-summary-see-all-reviews")["href"]
     return "https://www.amazon.com" + reviews_href
def keyword_to_long_tail_keyword_list(keyword):
    """Return Amazon search-suggestion ("long tail") keywords for *keyword*.

    Queries the completion endpoint, prints every suggestion found, and
    returns them as a list. On any failure a message is printed and None
    is returned (unchanged from the original contract).
    """
    import ast  # local import: the file's top-level imports are outside this block

    try:
        print("keyword:", keyword)
        # The dash ruler matches the length of "keyword: <keyword>"; extra
        # dashes reveal stray whitespace around the search term.
        print("-" * (len("keyword: ") + len(keyword)))
        url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=Y5KKREBZPVVDRZT19HX9&s=133-8959284-8300960&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&q="
        url_tail = "&qs=&cf=1&fb=1&sc=1&"
        keyword = keyword.replace(" ", "%20").replace("'", "%27")
        url = url_head + keyword + url_tail

        soup = amazon_module.download_soup_by_url(url)

        # Trim the JSONP-style wrapper, then parse the remaining literal.
        # ast.literal_eval replaces eval(): the payload comes from the
        # network and must never be executed as arbitrary code.
        soup_string = soup.get_text()[13:-11]
        soup_list = ast.literal_eval(soup_string)

        long_tail_keyword_list = []
        for long_tail_keyword in soup_list[1]:
            print(long_tail_keyword)
            long_tail_keyword_list.append(long_tail_keyword)

        print("")
        return long_tail_keyword_list
    except Exception:
        # The original had two nested try/except blocks with identical
        # handlers; one targeted handler is sufficient.
        print("can't find long tail words")
def keyword_to_long_tail_keyword_list(keyword):
    """Return Amazon search-suggestion keywords for *keyword* (quiet variant).

    NOTE(review): this redefines the function of the same name defined
    earlier in the file; at import time this later definition wins.
    Returns a list of suggestions, or None after printing a message on
    failure.
    """
    import ast  # local import: the file's top-level imports are outside this block

    try:
        keyword = keyword.replace(" ", "%20").replace("'", "%27")
        # startswith() avoids the IndexError that keyword[0] raised for an
        # empty keyword (previously swallowed by the bare except).
        if keyword.startswith("*"):
            url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=X8QW0QJV6AP2J4TJAZM4&s=140-5560419-0294343&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&ks=8&q=*&qs="
            url_tail = "&cf=1&fb=1&sc=1&"
        else:
            url_head = "https://completion.amazon.com/search/complete?method=completion&mkt=1&r=X8QW0QJV6AP2J4TJAZM4&s=140-5560419-0294343&c=&p=Gateway&l=en_US&b2b=0&fresh=0&sv=desktop&client=amazon-search-ui&x=String&search-alias=aps&ks=8&q="
            url_tail = "&cf=1&fb=1&sc=1&"

        url = url_head + keyword + url_tail
        soup = amazon_module.download_soup_by_url(url)

        # Trim the JSONP-style wrapper, then parse the remaining literal.
        # ast.literal_eval replaces eval() on network-supplied content.
        soup_string = soup.get_text()[13:-11]
        soup_list = ast.literal_eval(soup_string)

        return list(soup_list[1])
    except Exception:
        print("can't find long tail words")
Beispiel #4
0
    def keyword_to_asin_list(self):
        """For each brand in ./brands, record whether an Amazon search finds results.

        Writes brand / search URL / state into self.listing_info_dict and
        flushes it to CSV per brand, sleeping a random 0-5 s between requests.
        """
        print("brand_to_asin_list is running...")
        # Loop-invariant; hoisted out of the per-brand loop.
        base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
        # "with" guarantees the brand file is closed even on error.
        with open('./brands', 'r') as brands:
            for brand in brands:
                try:
                    keyword_with_plus = "+".join(brand.split())
                    first_page_url = base_url + keyword_with_plus
                    get_url_sleep_time = random.randint(0, 5)
                    soup = amazon_module.download_soup_by_url(first_page_url)
                    # BUG FIX: the original searched a str pattern against
                    # get_text().encode("utf-8") (bytes), which raises
                    # TypeError on Python 3 for every brand. Search the
                    # text directly.
                    page_text = soup.get_text()
                    if re.search(r"No results for", page_text):
                        state = 'no results'
                    else:
                        state = 'yes'
                    self.listing_info_dict["brand"] = brand
                    self.listing_info_dict["serch_url"] = first_page_url
                    self.listing_info_dict["state"] = state
                    try:
                        # Best-effort persistence, as in the original.
                        self.listing_info_dict_to_csv_file()
                    except Exception:
                        pass
                    time.sleep(get_url_sleep_time)

                except Exception as e:
                    print("{}".format(e))
    def keyword_to_asin_list(self):
        """Return the ASINs on the first search-result page for self.keyword.

        Order is preserved and duplicates are removed. Returns None when
        the page cannot be fetched or parsed (unchanged contract).
        """
        try:
            print("Start running, may take a few minutes")
            base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
            first_page_url = base_url + "+".join(self.keyword.split())
            soup = amazon_module.download_soup_by_url(first_page_url)

            temp_others_asin_list = []
            for li in soup.find_all("li", class_="s-result-item"):
                try:
                    temp_others_asin_list.append(li["data-asin"])
                except KeyError:
                    # Result tiles without a data-asin attribute are skipped.
                    pass

            # dict.fromkeys() de-duplicates in O(n) while preserving
            # first-seen order (the original membership scan was O(n^2)).
            return list(dict.fromkeys(temp_others_asin_list))
        except Exception:
            pass
 def asin_to_sponsored_asins(self):
     """Return the ASINs of sponsored products shown on this listing's detail page."""
     detail_url = "https://www.amazon.com/dp/" + self.others_asin
     soup = amazon_module.download_soup_by_url(detail_url)
     items = soup.find(id="sp_detail").find("ol").find_all("li")
     return [item.find("div")["data-asin"] for item in items]
def asin_to_size_weight(asin):
    """Return the item-info dict for *asin* from the FBA revenue-calculator endpoint."""
    url = (
        "https://sellercentral.amazon.com/fba/profitabilitycalculator/productmatches?searchKey="
        + asin
        + "&language=en_US&profitcalcToken=p9FcdMuSse7SGDBTzP9EgOn9nuQj3D"
    )
    payload = amazon_module.download_soup_by_url(url).get_text()
    # The endpoint answers JSON; the first entry under "data" describes the item.
    return json.loads(payload)['data'][0]
 def storefront_url_to_store_url_list(self):
     """Collect up to self.max_page storefront page URLs by following "next page" links.

     Stops early (returning what was gathered) when no next-page link exists.
     """
     urls = [self.storefront_url]
     remaining = self.max_page
     while remaining > 1:
         soup = amazon_module.download_soup_by_url(urls[-1])
         try:
             next_href = soup.find(id="pagnNextLink")['href']
             if next_href:
                 urls.append("https://www.amazon.com" + next_href)
         except:
             # No pagination link on this page: stop following.
             return urls
         remaining -= 1
     return urls
def page_url_to_asin_list(url):
    """Scrape a best-seller list page and return "<asin><badge>" strings.

    Returns None when the page itself cannot be fetched or parsed;
    individual unparsable items are skipped.
    """
    collected = []
    try:
        soup = amazon_module.download_soup_by_url(url)
        items = soup.find(id="zg-ordered-list").find_all(
            "li", class_="zg-item-immersion")
        for rank, item in enumerate(items):
            try:
                link = item.find("a")['href']
                asin = re.findall(r"dp/(.*?)/ref", link)[0]
                badge = item.find("span", class_="zg-badge-text").get_text()
                print(rank, asin, badge)
                collected.append(asin + badge)
            except:
                pass
        return collected
    except:
        pass
def asin_to_offer_listing(asin, now, country=None):
    """Scrape the new-condition offer listing for *asin*.

    Returns a list of offer dicts (detail_id, offer_num, offer_asin,
    offer_name, offer_price, offer_url). *now* is stored verbatim as
    detail_id; a truthy *country* switches the request to amazon.co.uk.
    """
    print("offer asin: ", asin)
    domain = "https://www.amazon.co.uk" if country else "https://www.amazon.com"
    url = domain + "/gp/offer-listing/" + asin + "/dp_olp_new_mbc?ie=UTF8&condition=new"
    soup = amazon_module.download_soup_by_url(url)
    offering_list = []
    try:
        container = soup.find("div", id="olpOfferList")
        if container:
            divs = container.find_all(
                "div", class_="a-row a-spacing-mini olpOffer")
            for num, div in enumerate(divs, start=1):
                # Defaults for offers sold directly by Amazon (no seller span).
                store_name = "Amazon"
                store_url = "https://www.amazon.com"
                store_price = ""
                span_name = div.find("span",
                                     class_="a-size-medium a-text-bold")
                if span_name:
                    store_name = span_name.get_text().strip()
                    store_url = store_url + span_name.find("a")['href']
                span_price = div.find(
                    "span",
                    class_=
                    "a-size-large a-color-price olpOfferPrice a-text-bold")
                if span_price:
                    # Raw string fixes the invalid-escape warning; guarding
                    # the match fixes a bug where group() on a failed match
                    # raised and silently dropped every remaining offer.
                    price_match = re.search(r'(\d*\.\d*)',
                                            span_price.get_text().strip())
                    if price_match:
                        store_price = price_match.group()
                offer_dict = {
                    "detail_id": now,
                    "offer_num": num,
                    "offer_asin": asin,
                    "offer_name": store_name,
                    "offer_price": store_price,
                    "offer_url": store_url,
                }
                offering_list.append(offer_dict)

    except Exception as e:
        print("fail to asin_to_offer_listing: {}".format(e))
    return offering_list
def keyword_to_amz_rlt(keyword):
    """Return the Amazon result count for *keyword* as a digit string.

    Prints a message and returns None when the count cannot be found.
    """
    try:
        url_head = "https://www.amazon.com/s/ref=nb_sb_noss/147-7192934-0083761?url=search-alias%3Daps&field-keywords="
        url = url_head + keyword

        soup = amazon_module.download_soup_by_url(url)

        results = soup.find(id="s-result-count").get_text()

        # Strip the framing words and thousands separators,
        # e.g. "of 2,000 results" -> "2000".
        m = re.search(r"of (.*?) results", results)
        count = m.group().replace("of ", "").replace(" results",
                                                     "").replace(",", "")
        return count
    except Exception:
        print("fail to find results")
    def store_url_to_asin_list(self, store_url):
        """Scrape one storefront page: persist listing info for every product and return the ASINs found."""
        soup = amazon_module.download_soup_by_url(store_url)
        asin_list = []
        for li in soup.find_all("li", class_="celwidget"):
            self.asin = li["data-asin"]
            self.listing_info_dict = self.asin_to_listing_info()
            self.listing_info_dict_to_csv_file()
            asin_list.append(self.asin)

            # Best-seller badge is optional; missing badges are ignored.
            self.best_seller_badge = ""
            try:
                badge_tag = li.find(id="BESTSELLER_" + self.asin)
                badge_text = " ".join(badge_tag.get_text().split())
                self.best_seller_badge = badge_text.replace(
                    "Best Seller", "Best Seller ", 1)
                print("best_seller_badge:", self.best_seller_badge)
            except:
                pass
        return asin_list
def keyword_to_mw_rank(keyword):
    """Return the top (highest) search-volume figure MerchantWords reports for *keyword*.

    Prints a message and returns None when the table cannot be scraped.
    """
    try:
        query = keyword.replace(" ", "%20")
        url = "https://www.merchantwords.com/search/us/" + query + "/sort-highest"
        soup = amazon_module.download_soup_by_url(url)

        rows = soup.find("table").find("tbody").find_all("tr")

        node_list = []
        for row in rows:
            try:
                blurry_words = row.find("span").get_text()

                volume = row.find_all("td")[1].get_text().replace(",", "")

                # Flatten the <small> cell into a "; "-separated string.
                node = str(row.find("small"))
                for old, new in (("<br/>", "; "), ("<small>", ""),
                                 ("</small>", ""), ("&amp;", "&")):
                    node = node.replace(old, new)

                node_list.append((blurry_words, volume, node))
            except:
                pass

        # Rows are sorted highest-first, so the first volume is the rank.
        return node_list[0][1]
    except:
        print("fail to get merchantwords rank!")
    def keyword_to_all_listing_asin_list(self):
        """Walk up to self.max_page search-result pages for self.keyword.

        For every listing found, scrapes its info dict, annotates it with
        page rank / sponsored state / prime state, appends it to
        self.listing_info_dict_list, and best-effort downloads its picture.
        """
        base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
        first_page_url = base_url + self.keyword

        pages_urls_list = [first_page_url]
        page = 1
        while page <= self.max_page:
            soup = amazon_module.download_soup_by_url(pages_urls_list[-1])

            # BUG FIX: the counter used to be incremented inside the try
            # block, so when no "next" link existed the subscript raised,
            # the counter never advanced, and the last page was re-fetched
            # forever. Now the counter always advances and the loop breaks
            # after processing the final page.
            has_next = False
            try:
                next_href = soup.find(id="pagnNextLink")["href"]
                if next_href:
                    pages_urls_list.append("https://www.amazon.com" + next_href)
                    has_next = True
            except Exception:
                pass
            page = page + 1

            try:
                lis = soup.find_all("li", class_="s-result-item")
                for index, li in enumerate(lis):
                    try:
                        asin = li["data-asin"]
                        self.asin = asin

                        page_rank = "page" + str(page - 1) + "-" + str(index + 1)
                        print("page_rank: ", page_rank)

                        # A leading "Sponsored" word in the tile's <h5>
                        # marks a paid placement.
                        sponsored_or_natural_rank = "natural_rank"
                        try:
                            if li.find("h5").get_text().strip().split()[0] == "Sponsored":
                                sponsored_or_natural_rank = "sponsored"
                        except Exception:
                            pass
                        print("sponsored_or_natural_rank: ", sponsored_or_natural_rank)

                        is_prime = ""
                        try:
                            if li.find("i", class_="a-icon-prime"):
                                is_prime = "prime"
                        except Exception:
                            pass
                        print("is_prime: ", is_prime)

                        listing_info_dict = self.asin_to_listing_info()
                        listing_info_dict["page_rank"] = page_rank
                        listing_info_dict["sponsored_or_natural_rank"] = sponsored_or_natural_rank
                        listing_info_dict["is_prime"] = is_prime
                        self.listing_info_dict_list.append(listing_info_dict)
                        try:
                            self.picture_url = listing_info_dict['picture_url']
                            self.download_picture_by_url()
                        except Exception:
                            pass
                    except Exception:
                        pass
            except Exception:
                pass

            if not has_next:
                break
Beispiel #15
0
 def __init__(self, asin):
     """Fetch the product detail page for *asin* and keep its parsed soup."""
     detail_url = "https://www.amazon.com/dp/" + asin
     self.soup = amazon_module.download_soup_by_url(detail_url)
Beispiel #16
0
def asin_to_listing_info(asin):
    """Scrape the Amazon product page for *asin* and return a dict of listing fields.

    Every field is scraped best-effort: a failed lookup leaves the field
    at its default (" " or "") instead of aborting, and each value is
    printed as it is found. Returns the assembled listing_info_dict.
    """
    import json  # local import: the file's top-level imports are outside this block

    print("asin: ", asin)
    url = "https://www.amazon.com/dp/" + asin
    soup = amazon_module.download_soup_by_url(url)
    print(len(soup))

    brand = " "
    try:
        # "brand" (when present) overrides "bylineInfo".
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except Exception:
        pass
    print("brand:", brand)

    badge = ""
    try:
        if soup.find("a", class_="badge-link"):
            badge = " ".join(soup.find("a", class_="badge-link").get_text().strip().split())
    except Exception:
        pass
    print("badge:", badge)

    title = ""
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except Exception:
        pass
    print("title:", title)

    # Variation label: pattern, color, or size — whichever is present first.
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(id="variation_pattern_name").find("span").get_text().strip()
            print("variation_pattern_name: ", variation_name)
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(id="variation_color_name").find("span").get_text().strip()
            print("variation_color_name: ", variation_name)
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(id="variation_size_name").find("span").get_text().strip()
            print("variation_size_name: ", variation_name)
        else:
            print("variation_name: ", variation_name)
    except Exception:
        pass

    price = " "
    try:
        # "priceblock_ourprice" (when present) overrides "price".
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text().strip()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text().strip()
    except Exception:
        pass
    print("price:", price)

    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            sold_by = " ".join(soup.find(id="merchant-info").get_text().strip().split())
    except Exception:
        pass
    print("sold_by:", sold_by)

    how_many_sellers = " "
    try:
        if soup.find(id="olp_feature_div"):
            how_many_sellers = soup.find(id="olp_feature_div").find("a").get_text().strip()
    except Exception:
        pass
    print("how_many_sellers:", how_many_sellers)

    bullets_list = []
    try:
        if soup.find("div", id="feature-bullets"):
            bullets_contents = soup.find("div", id="feature-bullets").find_all("span", class_="a-list-item")
            print("bullets:")
            for bullets_content in bullets_contents:
                print(bullets_content.get_text().strip())
                # The toys category nests an extra <span>; skip those entries.
                if bullets_content.span:
                    continue
                bullets_list.append(bullets_content.get_text().strip())
    except Exception:
        pass

    description = " "
    try:
        # A+ content (id="aplus") takes precedence over the plain description.
        if soup.find(id="productDescription"):
            description = soup.find(id="productDescription").get_text()
        if soup.find(id="aplus"):
            description = soup.find(id="aplus").get_text()
        description = " ".join(description.split())
    except Exception:
        pass
    print("description:", description)

    salesrank = " "
    try:
        if soup.find(id="SalesRank"):
            salesrank = soup.find(id="SalesRank").get_text().strip()
            # Raw strings fix the invalid-escape warnings the plain
            # literals produced on modern Python.
            salesrank = re.search(r'#(\d|,)+', salesrank).group()
            salesrank = salesrank.replace(',', '').replace('#', '')
        # The toys category exposes the rank in a details table instead.
        if soup.find(id="productDetails_detailBullets_sections1"):
            trs = soup.find(id="productDetails_detailBullets_sections1").find_all("tr")
            for tr in trs:
                if tr.find("th").get_text().strip() == "Best Sellers Rank":
                    salesrank = tr.find("td").get_text().strip()
                    salesrank = re.search(r'#(\d|,)+', salesrank).group()
                    salesrank = salesrank.replace(',', '').replace('#', '')
    except Exception:
        pass
    print("salesrank:", salesrank)

    review_num = " "
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(id="acrCustomerReviewText").get_text().split()[0].strip()
    except Exception:
        pass
    print("review_num:", review_num)

    review_value = " "
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(class_="arp-rating-out-of-text").get_text().strip()
            # Keep only the leading figure (text up to the first whitespace).
            review_value = re.search(r'(.*?)\s', review_value).group().strip()
    except Exception:
        pass
    print("review_value:", review_value)

    qa_num = " "
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except Exception:
        pass
    print("qa_num:", qa_num)

    picture_url = " "
    try:
        picture_urls_dict = dict()
        if soup.find("img", id="landingImage"):
            picture_urls = soup.find("img", id="landingImage")["data-a-dynamic-image"]
            # The attribute holds JSON (url -> [width, height]); json.loads
            # replaces eval(), which must never run on content fetched from
            # the network.
            picture_urls_dict = json.loads(picture_urls)
        picture_url = list(picture_urls_dict.keys())[0]
    except Exception:
        pass
    print("picture_url:", picture_url)

    listing_info_dict = {
                         "asin": asin,
                         "url": url,
                         "brand": brand,
                         "badge": badge,
                         "title": title,
                         "variation_name": variation_name,
                         "price": price,
                         "sold_by": sold_by,
                         "how_many_sellers": how_many_sellers,
                         "bullets": bullets_list,
                         "description": description,
                         "salesrank": salesrank,
                         "review_num": review_num,
                         "review_value": review_value,
                         "qa_num": qa_num,
                         "picture_url": picture_url
                         }

    return listing_info_dict
Beispiel #17
0
def keyword_to_asin_list(keyword, max_page, table_name, conn):
    """Walk up to *max_page* search-result pages for *keyword*.

    For every listing found, scrapes its info dict, annotates it with
    page rank / sponsored state / prime state, inserts it into MySQL
    table *table_name* via *conn*, and best-effort downloads its picture
    into a folder named after the keyword.
    """
    print("keyword_to_asin_list is running...")
    base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
    keyword_with_underline = "_".join(keyword.split())
    keyword_with_plus = "+".join(keyword.split())
    first_page_url = base_url + keyword_with_plus

    pages_urls_list = [first_page_url]
    page = 1
    while page <= max_page:
        soup = amazon_module.download_soup_by_url(pages_urls_list[-1])

        # BUG FIX: the counter used to be incremented inside the try block,
        # so when no "next" link existed the subscript raised, the counter
        # never advanced, and the last page was re-fetched forever. Now the
        # counter always advances and the loop breaks after processing the
        # final page.
        has_next = False
        try:
            next_href = soup.find(id="pagnNextLink")["href"]
            if next_href:
                pages_urls_list.append("https://www.amazon.com" + next_href)
                has_next = True
        except Exception:
            pass
        page = page + 1

        try:
            lis = soup.find_all("li", class_="s-result-item")
            for index, li in enumerate(lis):
                try:
                    asin = li["data-asin"]

                    page_rank = "page" + str(page - 1) + "-" + str(index + 1)
                    print("page_rank: ", page_rank)

                    # A leading "Sponsored" word in the tile's <h5> marks a
                    # paid placement.
                    sponsored_or_natural_rank = "natural_rank"
                    try:
                        if li.find("h5").get_text().strip().split()[0] == "Sponsored":
                            sponsored_or_natural_rank = "sponsored"
                    except Exception:
                        pass
                    print("sponsored_or_natural_rank: ", sponsored_or_natural_rank)

                    is_prime = ""
                    try:
                        if li.find("i", class_="a-icon-prime"):
                            is_prime = "prime"
                    except Exception:
                        pass
                    print("is_prime: ", is_prime)

                    listing_info_dict = asin_to_listing_info(asin)
                    listing_info_dict["page_rank"] = page_rank
                    listing_info_dict["sponsored_or_natural_rank"] = sponsored_or_natural_rank
                    listing_info_dict["is_prime"] = is_prime

                    try:
                        insert_data_to_mysql(listing_info_dict, table_name, conn)
                    except Exception:
                        pass

                    try:
                        picture_url = listing_info_dict['picture_url']
                        picture_folder = keyword_with_underline
                        download_picture_by_url(picture_url, picture_folder, asin)
                    except Exception:
                        print("fail to download picture")
                except Exception:
                    pass
        except Exception:
            pass

        if not has_next:
            break
Beispiel #18
0
    def first_review_url_to_review_info(self, url, asin):
        """Scrape review pages for *asin* and write each page's reviews to CSV.

        Builds a filter/sort review-list URL from *url* (the first reviews
        page), walks up to self.max_page pages, collects every review on a
        page into self.review_dict_list, and flushes that list to CSV once
        per page via self.dict_list_to_csv_file().
        """
        # Everything before "ref=" is the stable part of the reviews URL.
        location = re.search("ref=", url)
        span = location.span()[0]
        first_review_url_part1 = url[:span]

        # Filter (star rating) and sort order come from instance configuration.
        review_base_url = first_review_url_part1 + "ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&filterByStar=" + self.all_or_positive_or_critical + "&reviewerType=all_reviews&sortBy=" + self.top_or_recent + "&pageNumber="
        first_review_url = review_base_url + str(1)
        first_review_url_soup = amazon_module.download_soup_by_url(first_review_url)

        # Read the last page number from the pagination bar; default to 1
        # when the bar is missing (only one page of reviews).
        last_page = 1
        try:
            last_page = first_review_url_soup.find(id="cm_cr-pagination_bar").find_all("li", class_="page-button")[-1].get_text()
        except:
            pass
        last_page = int(last_page)
        min_page = min(last_page, self.max_page)

        for page in range(1, min_page+1):
            review_url = review_base_url + str(page)
            # A failure anywhere on a page skips that whole page.
            try:
                soup = amazon_module.download_soup_by_url(review_url)
                review_list = soup.find(id="cm_cr-review_list").find_all("div", {"data-hook":"review"})

                # Reset per page: the CSV flush below writes one page at a time.
                self.review_dict_list = []
                for review_index, review in enumerate(review_list):
                    review_title = review.find("a", {"data-hook":"review-title"}).get_text()
                    review_star_rating = review.find("i", {"data-hook": "review-star-rating"}).get_text()
                    review_author = review.find("a", {"data-hook": "review-author"}).get_text()
                    review_date = review.find("span", {"data-hook": "review-date"}).get_text()
                    review_body = review.find("span", {"data-hook": "review-body"}).get_text()
                    page_rank = "page" + str(page) + "-" + str(review_index + 1)
                    profile_url_part = review.find("a", {"data-hook": "review-author"})['href']
                    profile_url = "https://www.amazon.com" + profile_url_part

                    # Verified-purchase badge is optional ("bage" spelling kept:
                    # it is a dict key consumed downstream).
                    review_bage = ""
                    try:
                        review_bage = review.find("span", {"data-hook": "avp-badge"}).get_text()
                    except:
                        pass

                    # Variation (format strip) is optional as well.
                    review_variation = ""
                    try:
                        review_variation = review.find("a", {"data-hook": "format-strip"}).get_text()
                    except:
                        pass

                    review_dict = { "page_rank": page_rank,
                                    "asin": asin,
                                    "review_bage": review_bage,
                                    "review_variation": review_variation,
                                    "review_title": review_title,
                                    "review_star_rating": review_star_rating,
                                    "review_author": review_author,
                                    "review_date": review_date,
                                    "review_body": review_body,
                                    "profile_url": profile_url,
                                   }
                    print(review_dict)
                    self.review_dict_list.append(review_dict)
                self.dict_list_to_csv_file()
            except:
                pass
    def asin_to_listing_info(self):
        print("asin: ", self.asin)
        url = "https://www.amazon.com/dp/" + self.asin
        soup = amazon_module.download_soup_by_url(url)

        brand = " "
        try:
            if soup.find(id="bylineInfo"):
                brand = soup.find(id="bylineInfo").get_text().strip()
            if soup.find(id="brand"):
                brand = soup.find(id="brand").get_text().strip()
        except:
            pass
        print("brand:", brand)

        badge = " "
        try:
            if soup.find(id="acBadge_feature_div").find(
                    "div", class_="ac-badge-wrapper"):
                badge = " ".join(
                    soup.find(id="acBadge_feature_div").find(
                        "div",
                        class_="ac-badge-wrapper").get_text().strip().split())
                badge = badge.replace(
                    "Amazon's Choice recommends highly rated, well-priced products available to ship immediately. ",
                    "", 1)
        except:
            pass
        print("badge:", badge)

        title = " "
        try:
            if soup.find(id="productTitle"):
                title = soup.find(id="productTitle").get_text().strip()
        except:
            pass
        print("title:", title)

        variation_name = " "
        try:
            if soup.find(id="variation_pattern_name"):
                variation_name = soup.find(id="variation_pattern_name").find(
                    "span").get_text().strip()
                print("variation_pattern_name: ", variation_name)
            elif soup.find(id="variation_color_name"):
                variation_name = soup.find(
                    id="variation_color_name").find("span").get_text().strip()
                print("variation_color_name: ", variation_name)
            elif soup.find(id="variation_size_name"):
                variation_name = soup.find(
                    id="variation_size_name").find("span").get_text().strip()
                print("variation_size_name: ", variation_name)
            else:
                print("variation_name: ", variation_name)
            variation_name = " ".join(variation_name.split())
        except:
            pass

        price = " "
        try:
            if soup.find(id="price"):
                price = soup.find(id="price").find("span").get_text().strip()
            if soup.find(id="priceblock_ourprice"):
                price = soup.find(id="priceblock_ourprice").get_text().strip()
        except:
            pass
        print("price:", price)

        sold_by = " "
        try:
            if soup.find(id="merchant-info"):
                sold_by = " ".join(
                    soup.find(id="merchant-info").get_text().strip().split())
        except:
            pass
        print("sold_by:", sold_by)

        how_many_sellers = " "
        try:
            if soup.find(id="olp_feature_div"):
                how_many_sellers = soup.find(
                    id="olp_feature_div").find("a").get_text().strip()
        except:
            pass
        print("how_many_sellers:", how_many_sellers)

        bullets_list = []
        try:
            if soup.find("div", id="feature-bullets"):
                bullets_contents = soup.find(
                    "div", id="feature-bullets").find_all("span",
                                                          class_="a-list-item")
                for bullets_content in bullets_contents:
                    print(bullets_content.get_text().strip())
                    #toys
                    if bullets_content.span:
                        continue
                    bullets_list.append(bullets_content.get_text().strip())
                    bullets = bullets_list

                    bullet_1 = " "
                    bullet_2 = " "
                    bullet_3 = " "
                    bullet_4 = " "
                    bullet_5 = " "
                    bullet_6 = " "
                    bullet_7 = " "
                    bullet_8 = " "
                    bullet_9 = " "
                    bullet_10 = " "
                    if bullets:
                        try:
                            bullet_1 = bullets[0]
                        except:
                            pass
                        try:
                            bullet_2 = bullets[1]
                        except:
                            pass
                        try:
                            bullet_3 = bullets[2]
                        except:
                            pass
                        try:
                            bullet_4 = bullets[3]
                        except:
                            pass
                        try:
                            bullet_5 = bullets[4]
                        except:
                            pass
                        try:
                            bullet_6 = bullets[5]
                        except:
                            pass
                        try:
                            bullet_7 = bullets[6]
                        except:
                            pass
                        try:
                            bullet_8 = bullets[7]
                        except:
                            pass
                        try:
                            bullet_9 = bullets[8]
                        except:
                            pass
                        try:
                            bullet_10 = bullets[9]
                        except:
                            pass
        except:
            pass
        print("bullets_list:", bullets_list)

        a_plus_page = " "
        try:
            if soup.find(id="aplus"):
                a_plus_page = soup.find(id="aplus").get_text()
            a_plus_page = " ".join(a_plus_page.split())
        except:
            pass
        a_plus_page = re.sub(r"(Product Description.*; } )", "", a_plus_page)
        a_plus_page = re.sub(r"(From the manufacturer.*; } )", "", a_plus_page)
        a_plus_page = a_plus_page.replace("View larger ", "")
        a_plus_page = a_plus_page.replace("Read more ", "")
        print("a_plus_page:", a_plus_page)

        description = " "
        try:
            if soup.find(id="productDescription"):
                description = soup.find(id="productDescription").get_text()
            description = " ".join(description.split())
        except:
            pass
        description = re.sub(r"(Product Description.*; } )", "", description)
        description = re.sub(r"(From the manufacturer.*; } )", "", description)
        description = description.replace("View larger ", "")
        description = description.replace("Read more ", "")
        print("description:", description)

        salesrank = " "
        try:
            if soup.find(id="SalesRank"):
                salesrank = soup.find(id="SalesRank")
                salesrank = salesrank.get_text().strip()
                salesrank = re.search('#(\d|,)+', salesrank)
                salesrank = salesrank.group()
                salesrank = salesrank.replace(',', '')
                salesrank = salesrank.replace('#', '')
            #toys
            if soup.find(id="productDetails_detailBullets_sections1"):
                trs = soup.find(
                    id="productDetails_detailBullets_sections1").find_all("tr")
                for tr in trs:
                    if tr.find("th").get_text().strip():
                        if tr.find("th").get_text().strip(
                        ) == "Best Sellers Rank":
                            salesrank = tr.find("td").get_text().strip()
                            salesrank = re.search('#(\d|,)+', salesrank)
                            salesrank = salesrank.group()
                            salesrank = salesrank.replace(',', '')
                            salesrank = salesrank.replace('#', '')
        except:
            pass
        print("salesrank:", salesrank)

        review_num = " "
        try:
            if soup.find(id="acrCustomerReviewText"):
                review_num = soup.find(
                    id="acrCustomerReviewText").get_text().split()[0].strip()
        except:
            pass
        print("review_num:", review_num)

        review_value = " "
        try:
            if soup.find(class_="arp-rating-out-of-text"):
                review_value = soup.find(
                    class_="arp-rating-out-of-text").get_text().strip()
                review_value = re.search('(.*?)\s', review_value)
                review_value = review_value.group()
                review_value = review_value.strip()
        except:
            pass
        print("review_value:", review_value)

        qa_num = " "
        try:
            if soup.find(id="askATFLink"):
                qa_num = soup.find(
                    id="askATFLink").get_text().split()[0].strip()
        except:
            pass
        print("qa_num:", qa_num)

        picture_url = " "
        try:
            picture_urls_dict = dict()
            if soup.find("img", id="landingImage"):
                picture_urls = soup.find(
                    "img", id="landingImage")["data-a-dynamic-image"]
                picture_urls_dict = eval(picture_urls)
            picture_urls_list = []
            for key in picture_urls_dict.keys():
                picture_urls_list.append(key)
            picture_url = picture_urls_list[0]
        except:
            pass
        print("picture_url:", picture_url)
        self.picture_url = picture_url
        self.download_picture_by_url()

        self.listing_info_dict = {
            "asin": self.asin,
            "url": url,
            "brand": brand,
            "best_seller_badge": self.best_seller_badge,
            "badge": badge,
            "title": title,
            "variation_name": variation_name,
            "price": price,
            "sold_by": sold_by,
            "how_many_sellers": how_many_sellers,
            "bullet_1": bullet_1,
            "bullet_2": bullet_2,
            "bullet_3": bullet_3,
            "bullet_4": bullet_4,
            "bullet_5": bullet_5,
            "bullet_6": bullet_6,
            "bullet_7": bullet_7,
            "bullet_8": bullet_8,
            "bullet_9": bullet_9,
            "bullet_10": bullet_10,
            "a_plus_page": a_plus_page,
            "description": description,
            "salesrank": salesrank,
            "review_num": review_num,
            "review_value": review_value,
            "qa_num": qa_num,
            "picture_url": picture_url
        }

        return self.listing_info_dict
        with open(img_path, 'wb') as fp:
            fp.write(pic.content)
        print("SUCCESS to download picture")
    except requests.exceptions.ConnectionError:
        print("FAIL to download picture!")


# main
start_datetime = datetime.now()
print("start_datetime:", start_datetime)
csv_file_name = str(start_datetime).replace(":",
                                            ";").strip().split(".")[0] + ".csv"

for asin in asin_list:
    listing_url = "https://www.amazon.com/dp/" + asin
    soup = amazon_module.download_soup_by_url(listing_url)
    start_review_url_part2 = soup.find(id="dp-summary-see-all-reviews")["href"]
    start_review_url = "https://www.amazon.com" + start_review_url_part2

    location = re.search("ref=", start_review_url)
    span = location.span()[0]
    start_review_url_part1 = start_review_url[:span]
    # https://www.amazon.com/Stainless-Steel-Personalized-Tags-Lines/product-reviews/B00BJLS55G/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&pageNumber=1&sortBy=helpful&filterByStar=five_star&mediaType=media_reviews_only
    review_base_url = start_review_url_part1 + "ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=" + all_or_VP + "&sortBy=" + top_or_recent + "&filterByStar=" + stars + "&mediaType=" + all_or_media + "&pageNumber="
    first_review_url = review_base_url + str(1)

    for page in range(1, max_page + 1):
        review_url = review_base_url + str(page)
        try:
            dict_list = []
            soup = amazon_module.download_soup_by_url(review_url)
from bs4 import BeautifulSoup
from amazon_module import amazon_module
import re

# proxy
# NOTE(review): earlier comment called this a "SOCKS5 proxy", but the URL
# schemes below are http/https — confirm what 192.168.1.103:48850 really is.
proxies = {
    'http': 'http://192.168.1.103:48850',
    'https': 'https://192.168.1.103:48850',
}

# headers
# NOTE(review): `proxies` and `headers` appear unused by the visible code;
# presumably amazon_module.download_soup_by_url does the fetching — verify.
headers = {}

url = 'https://www.amazon.com/dp/B07B8CMX26'
soup = amazon_module.download_soup_by_url(url)
# results = open('./product-detail-page-one.html', 'r')
# r = results.read()
try:
    # Variations
    try:
        lis = soup.find("ul", role="radiogroup").find_all("li")
        variation_list = []
        for li in lis:
            variation_list.append(li['data-defaultasin'])
    except Exception as e:
        print("Analyze Variations Failed!: {}".format(e))
        pass
Beispiel #22
0
def asin_to_listing_info(asin, index):
    """Scrape an Amazon product detail page for *asin*, print each field,
    append the result row to a CSV and download the main product image.

    Every field is scraped best-effort: missing page elements fall back to
    a single-space placeholder and scraping errors are silently swallowed,
    so one bad field never aborts the whole page.

    Args:
        asin: Amazon Standard Identification Number of the product.
        index: row index forwarded to ``dict_list_to_csv_file``.
    """

    print("asin: ", asin)

    url = "https://www.amazon.com/dp/" + asin
    print("url: ", url)

    soup = amazon_module.download_soup_by_url(url)

    # Brand: the "brand" element (when present) overrides "bylineInfo".
    brand = " "
    try:
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except:
        pass
    print("brand: ", brand)

    # Badge, e.g. "Amazon's Choice" / "#1 Best Seller".
    badge = " "
    try:
        if soup.find("a", class_="badge-link"):
            badge = " ".join(
                soup.find("a", class_="badge-link").get_text().strip().split())
    except:
        pass
    print("badge: ", badge)

    title = " "
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except:
        pass
    print("title: ", title)

    # Variation name: first matching of pattern / color / size variants.
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(
                id="variation_pattern_name").find("span").get_text().strip()
            print("variation_pattern_name: ", variation_name)
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(
                id="variation_color_name").find("span").get_text().strip()
            print("variation_color_name: ", variation_name)
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(
                id="variation_size_name").find("span").get_text().strip()
            print("variation_size_name: ", variation_name)
        else:
            print("variation_name: ", variation_name)
    except:
        pass

    # Price: "priceblock_ourprice" (when present) overrides "price".
    price = " "
    sale_price = " "
    try:
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text().strip()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text().strip()
        if soup.find(id="priceblock_saleprice"):
            sale_price = soup.find(
                id="priceblock_saleprice").get_text().strip()
    except:
        pass
    print("price: ", price)
    print("sale_price: ", sale_price)

    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            # print("soup.find(id='merchant-info').get_text().strip(): ", soup.find(id="merchant-info").get_text().strip())
            sold_by = " ".join(
                soup.find(id="merchant-info").get_text().strip().split())
    except:
        pass
    print("sold_by: ", sold_by)

    how_many_sellers = " "
    try:
        if soup.find(id="olp_feature_div"):
            how_many_sellers = soup.find(
                id="olp_feature_div").find("a").get_text().strip()
    except:
        pass
    print("how_many_sellers: ", how_many_sellers)

    # Feature bullets: collect up to five; bullets containing a nested
    # <span> are skipped (observed on toy listings).
    bullets_list = []
    bullet_1 = " "
    bullet_2 = " "
    bullet_3 = " "
    bullet_4 = " "
    bullet_5 = " "
    try:
        if soup.find("div", id="feature-bullets"):
            bullets_contents = soup.find("div", id="feature-bullets").find_all(
                "span", class_="a-list-item")
            # print("bullets: ")
            for bullets_content in bullets_contents:
                # print(bullets_content.get_text().strip())
                #toys
                if bullets_content.span:
                    continue
                bullets_list.append(bullets_content.get_text().strip())
    except:
        pass
    try:
        # IndexError past the end of bullets_list leaves the remaining
        # bullet_N at their placeholder value.
        bullet_1 = bullets_list[0]
        bullet_2 = bullets_list[1]
        bullet_3 = bullets_list[2]
        bullet_4 = bullets_list[3]
        bullet_5 = bullets_list[4]
    except:
        pass
    print("bullet_1: ", bullet_1)
    print("bullet_2: ", bullet_2)
    print("bullet_3: ", bullet_3)
    print("bullet_4: ", bullet_4)
    print("bullet_5: ", bullet_5)

    description = " "
    try:
        if soup.find(id="productDescription"):
            description = soup.find(id="productDescription").get_text()
        if soup.find(id="aplus"):
            # NOTE(review): find_all() returns a ResultSet, which has no
            # get_text(); this line raises AttributeError, the bare except
            # swallows it, and description keeps its earlier value — the
            # "aplus" branch likely never works as intended. Confirm.
            description = soup.find(
                id="aplus").find("div").find_all("div").get_text()
            description = re.search(r".aplus-v2(.*)\}(.*)", description)
            description = description.group(1)
            description = description.strip()
        description = " ".join(description.split())
    except:
        pass
    print("description: ", description)

    salesrank = " "
    # try:
    #     if soup.find(id="SalesRank"):
    #         salesrank = soup.find(id="SalesRank")
    #         salesrank = salesrank.get_text().strip()
    #         salesrank = re.search('#(\d|,)+', salesrank)
    #         salesrank = salesrank.group()
    #         salesrank = salesrank.replace(',', '')
    #         salesrank = salesrank.replace('#', '')
    #     #toys
    #     if soup.find(id="productDetails_detailBullets_sections1"):
    #         trs = soup.find(id="productDetails_detailBullets_sections1").find_all("tr")
    #         for tr in trs:
    #             if tr.find("th").get_text().strip():
    #                 if tr.find("th").get_text().strip() == "Best Sellers Rank":
    #                     salesrank = tr.find("td").get_text().strip()
    #                     salesrank = re.search('#(\d|,)+', salesrank)
    #                     salesrank = salesrank.group()
    #                     salesrank = salesrank.replace(',', '')
    #                     salesrank = salesrank.replace('#', '')
    # except:
    #     pass
    # print("salesrank: ", salesrank)

    # Sales ranks: rank 1 is the top-level category rank from the
    # "SalesRank" node; ranks 2-3 are per-subcategory ranks from the
    # nested "zg_hrsr" list (at most two are kept).
    salesrank_1 = " "
    salesrank_2 = " "
    salesrank_3 = " "
    salesrank_node_1 = " "
    salesrank_node_2 = " "
    salesrank_node_3 = " "
    try:
        salesrank_1 = soup.find(id="SalesRank")
        salesrank_1 = salesrank_1.get_text().strip()
        salesrank_1 = re.search('#(\d|,)+', salesrank_1)
        salesrank_1 = salesrank_1.group()
        salesrank_1 = salesrank_1.replace(',', '')
        salesrank_1 = salesrank_1.replace('#', '')
        # print(salesrank_1)
        salesrank_node_1 = soup.find(id="SalesRank")
        salesrank_node_1 = salesrank_node_1.get_text().strip()
        salesrank_node_1 = re.search(r"in(.*?)\(", salesrank_node_1)
        salesrank_node_1 = salesrank_node_1.group()
        salesrank_node_1 = salesrank_node_1.replace("in ", "")
        salesrank_node_1 = salesrank_node_1.replace(" (", "")
        salesrank_node_1 = salesrank_node_1.strip()
        # print(salesrank_node_1)

        try:
            lis = soup.find(id="SalesRank").find(
                "ul", class_="zg_hrsr").find_all("li")
            node_salesrank_list = []
            node_name_list = []
            for li in lis:
                node_salesrank = li.get_text().strip()
                node_salesrank = re.search('#(\d|,)+', node_salesrank)
                node_salesrank = node_salesrank.group()
                node_salesrank = node_salesrank.replace(',', '')
                node_salesrank = node_salesrank.replace('#', '')
                node_salesrank_list.append(node_salesrank)

                node_name = li.get_text().strip()
                node_name = re.search(r"in(.*)", node_name)
                node_name = node_name.group()
                # "in" is followed by a non-breaking space on the page.
                node_name = node_name.replace("in\xa0", "")
                node_name = node_name.strip()
                node_name_list.append(node_name)
            # print(node_salesrank_list)
            # print(node_name_list)

            if len(node_salesrank_list) == 1:
                salesrank_2 = node_salesrank_list[0]
            if len(node_salesrank_list) == 2:
                salesrank_2 = node_salesrank_list[0]
                salesrank_3 = node_salesrank_list[1]

            if len(node_salesrank_list) == 1:
                salesrank_node_2 = node_name_list[0]
            if len(node_salesrank_list) == 2:
                salesrank_node_2 = node_name_list[0]
                salesrank_node_3 = node_name_list[1]
        except:
            pass
    except:
        pass
    print("salesrank_1: ", salesrank_1, " ", "salesrank_node_1: ",
          salesrank_node_1)
    print("salesrank_2: ", salesrank_2, " ", "salesrank_node_2: ",
          salesrank_node_2)
    print("salesrank_3: ", salesrank_3, " ", "salesrank_node_3: ",
          salesrank_node_3)

    review_num = " "
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(
                id="acrCustomerReviewText").get_text().split()[0].strip()
    except:
        pass
    print("review_num: ", review_num)

    # Average star rating, e.g. "4.5" out of "4.5 out of 5 stars".
    review_value = " "
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(
                class_="arp-rating-out-of-text").get_text().strip()
            review_value = re.search('(.*?)\s', review_value)
            review_value = review_value.group()
            review_value = review_value.strip()
    except:
        pass
    print("review_value: ", review_value)

    qa_num = " "
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except:
        pass
    print("qa_num: ", qa_num)

    # Main image: first key of the "data-a-dynamic-image" JSON-ish dict.
    # NOTE(review): eval() on page content is unsafe for untrusted input;
    # json.loads would be the safer parser — confirm before changing.
    picture_url = " "
    try:
        picture_urls_dict = dict()
        if soup.find("img", id="landingImage"):
            picture_urls = soup.find("img",
                                     id="landingImage")["data-a-dynamic-image"]
            picture_urls_dict = eval(picture_urls)
        picture_urls_list = []
        for key in picture_urls_dict.keys():
            picture_urls_list.append(key)
        picture_url = picture_urls_list[0]
    except:
        pass
    print("picture_url: ", picture_url)

    listing_info_dict = {
        "asin": asin,
        "url": url,
        "brand": brand,
        "badge": badge,
        "title": title,
        "variation_name": variation_name,
        "price": price,
        "sale_price": sale_price,
        "sold_by": sold_by,
        "how_many_sellers": how_many_sellers,
        # "bullets": bullets_list,
        "bullet_1": bullet_1,
        "bullet_2": bullet_2,
        "bullet_3": bullet_3,
        "bullet_4": bullet_4,
        "bullet_5": bullet_5,
        "description": description,
        # "salesrank": salesrank,
        "salesrank_1": salesrank_1,
        "salesrank_node_1": salesrank_node_1,
        "salesrank_2": salesrank_2,
        "salesrank_node_2": salesrank_node_2,
        "salesrank_3": salesrank_3,
        "salesrank_node_3": salesrank_node_3,
        "review_num": review_num,
        "review_value": review_value,
        "qa_num": qa_num,
        "picture_url": picture_url
    }

    # return listing_info_dict
    dict_list_to_csv_file(listing_info_dict, index)

    try:
        download_picture_by_url(asin, picture_url)
    except:
        pass
Beispiel #23
0
# ! /usr/bin/env python
# -*- coding:utf-8 -*-
from amazon_module import amazon_module
import re
import os
from bs4 import BeautifulSoup
import requests
import csv
import openpyxl

# Category-name lookup tables, presumably filled in by code below this
# chunk — confirm against the rest of the script.
node_dict = {}
node_url_dict = {}

# Amazon Best Sellers landing page; its "zg_browseRoot" browse tree lists
# the top-level categories.
top100_url = "https://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_unv_e_0_e_1"
soup = amazon_module.download_soup_by_url(top100_url)

# One <li> per top-level category in the browse tree.
lis = soup.find(id="zg_browseRoot").find("ul").find_all("li")

# for temp_li_index, temp_li in enumerate(lis):
#     print(temp_li_index, temp_li.get_text())

# 0 Amazon Devices & Accessories
# 1 Amazon Launchpad
# 2 Appliances
# 3 Apps & Games
# 4 Arts, Crafts & Sewing
# 5 Automotive
# 6 Baby
# 7 Beauty & Personal Care
# 8 Books
# 9 CDs & Vinyl
Beispiel #24
0
def asin_to_listing_info(asin, country=None):
    """Scrape an Amazon product detail page and extract listing data.

    Args:
        asin: Amazon Standard Identification Number of the product.
        country: when truthy, scrape amazon.co.uk instead of amazon.com
            (the value is only used as a flag and also switches the
            follow-seller parsing format below).

    Returns:
        A 4-tuple ``(listing_info_dict, ranking_list, offering_list,
        review_dict_list)`` where ``listing_info_dict`` is a flat dict of
        listing attributes, ``ranking_list`` holds one dict per
        Best-Sellers-Rank entry, ``offering_list`` is always empty here,
        and ``review_dict_list`` holds one dict per "most recent" review.

    Every field is scraped best-effort: missing elements keep their
    defaults and scraping errors are logged or swallowed so a single bad
    field does not abort the whole page.
    """
    # Timestamp doubles as a join key ("id" / "detail_id") across rows.
    now = int(time.time())
    # print("asin: ", asin)
    url = "https://www.amazon.com/dp/" + asin
    if country:
        url = "https://www.amazon.co.uk/dp/" + asin
    soup = amazon_module.download_soup_by_url(url)

    # Brand: the "brand" element (when present) overrides "bylineInfo".
    brand = " "
    brand_url = ""
    try:
        if soup.find(id="bylineInfo"):
            brand = soup.find(id="bylineInfo").get_text().strip()
            brand_url = soup.find(id="bylineInfo")["href"]
        if soup.find(id="brand"):
            brand = soup.find(id="brand").get_text().strip()
    except Exception:
        pass

    title = ""
    try:
        if soup.find(id="productTitle"):
            title = soup.find(id="productTitle").get_text().strip()
    except Exception:
        pass

    # Variation name: first matching of pattern / color / size variants.
    variation_name = " "
    try:
        if soup.find(id="variation_pattern_name"):
            variation_name = soup.find(
                id="variation_pattern_name").find("span").get_text().strip()
        elif soup.find(id="variation_color_name"):
            variation_name = soup.find(
                id="variation_color_name").find("span").get_text().strip()
        elif soup.find(id="variation_size_name"):
            variation_name = soup.find(
                id="variation_size_name").find("span").get_text().strip()
    except Exception:
        pass

    # Price: numeric part only; "priceblock_ourprice" overrides "price".
    price = 0.0
    try:
        if soup.find(id="price"):
            price = soup.find(id="price").find("span").get_text()
            price = re.search(r'(\d*\.\d*)', price)
            price = price.group()
        if soup.find(id="priceblock_ourprice"):
            price = soup.find(id="priceblock_ourprice").get_text()
            price = re.search(r'(\d*\.\d*)', price)
            price = price.group()
    except Exception:
        pass

    sold_by = " "
    try:
        if soup.find(id="merchant-info"):
            sold_by = " ".join(
                soup.find(id="merchant-info").get_text().strip().split())
    except Exception:
        pass

    availability = ""
    try:
        if soup.find(id="availability"):
            availability = soup.find(
                id="availability").find("span").get_text().strip()
    except Exception:
        pass

    # A+ content: only the first <h2> heading is captured.
    aplus = ""
    try:
        if soup.find(id="aplus"):
            aplus = soup.find(id="aplus").find("h2").get_text().strip()
    except Exception:
        pass

    ranking_list = []
    offering_list = []
    spans_text = ""

    review_dict_list = []
    review_last_desc = ""
    review_last_time = 0
    review_last_unit = ""

    # Salesrank: one rank dict per span inside the "Best Sellers Rank"
    # row of the product-details table.
    try:
        trs = soup.find(id="productDetails_detailBullets_sections1")
        if trs:
            trs = trs.find_all("tr")
            for tr in trs:
                try:
                    th = tr.find("th").get_text().strip()
                    if th == "Best Sellers Rank":
                        spans = tr.find("span").find_all("span")
                        num = 0
                        for span in spans:
                            try:
                                span_text = span.get_text()
                                spans_text = spans_text + span_text + "\n"
                                # "#1,234" -> "1234"
                                ranking = re.search(r'#(\d|,)+', span_text)
                                ranking = ranking.group()
                                ranking = ranking.replace(',', '')
                                ranking = ranking.replace('#', '')

                                # "#1,234 in Category (See Top 100)" ->
                                # "Category "
                                rank_text_arr = span_text.split(' in ')
                                rank_text = rank_text_arr[1]
                                rank_text_arr = rank_text.split('(')
                                rank_text = rank_text_arr[0]

                                num = num + 1
                                rank_dict = {
                                    "detail_id": now,
                                    "rank_num": num,
                                    "rank_asin": asin,
                                    "rank_order": ranking,
                                    "rank_text": rank_text,
                                }
                                ranking_list.append(rank_dict)
                            except Exception as e:
                                print("Handling Salesrank string errors !: {}".
                                      format(e))
                                pass

                except Exception as e:
                    print("Analyze Salesrank th errors!: {}".format(e))
                    pass
    except Exception as e:
        print("Analyze Salesrank errors!: {}".format(e))
        pass

    review_num = 0
    try:
        if soup.find(id="acrCustomerReviewText"):
            review_num = soup.find(id="acrCustomerReviewText").get_text(
            ).split()[0].strip(",").strip().replace(',', '')
    except Exception:
        pass

    review_value = 0.0
    try:
        if soup.find(class_="arp-rating-out-of-text"):
            review_value = soup.find(
                class_="arp-rating-out-of-text").get_text().strip()
            review_value = re.search(r'(.*?)\s', review_value)
            review_value = review_value.group()
            review_value = review_value.strip()
    except Exception:
        pass

    qa_num = 0
    try:
        if soup.find(id="askATFLink"):
            qa_num = soup.find(id="askATFLink").get_text().split()[0].strip()
    except Exception:
        pass

    # "Most recent" reviews widget: one dict per review; the first review
    # also feeds the review_last_* summary fields.
    try:
        review_list = soup.find(id="most-recent-reviews-content")
        if review_list and review_list.find_all(
                "div", {"data-hook": "recent-review"}):
            review_list = review_list.find_all("div",
                                               {"data-hook": "recent-review"})
            for review_index, review in enumerate(review_list):
                review_title = review.find("span", {
                    "data-hook": "review-title-recent"
                }).get_text()
                review_star_rating = review.find(
                    "i", {
                        "data-hook": "review-star-rating-recent"
                    }).get_text()
                review_author_url = review.find("a",
                                                {"class": "a-profile"})["href"]
                review_author = review.find("span", {
                    "class": "a-profile-name"
                }).get_text()
                review_date_desc = review.find(
                    "span", {
                        "data-hook": "review-author-timestamp"
                    }).get_text()
                review_body = review.find("span", {
                    "data-hook": "review-body-recent"
                }).get_text()
                # NOTE(review): lstrip/rstrip strip *character sets*, not
                # prefixes/suffixes — this happens to work for the observed
                # "Published N units ago" strings, but str.removeprefix /
                # removesuffix (3.9+) would be the correct tool.
                review_date_desc_temp = review_date_desc.lstrip(
                    'Published ').rstrip(' ago')
                if 'on ' in review_date_desc_temp:
                    # Absolute date ("on <date>"): treated as "2 years".
                    review_date = '2'
                    review_date_unit = 'year'
                else:
                    review_date_desc_arr = review_date_desc_temp.split(' ')
                    review_date = review_date_desc_arr[0]
                    review_date_unit = review_date_desc_arr[1].rstrip('s')
                #TODO:
                if review_index == 0:
                    review_last_desc = review_date_desc
                    review_last_time = review_date
                    review_last_unit = review_date_unit
                review_dict = {
                    "review_asin": asin,
                    "review_title": review_title,
                    # rstrip with a character set — see NOTE above.
                    "review_star":
                    review_star_rating.rstrip(" out of 5 stars"),
                    "review_author": review_author,
                    "review_author_url": review_author_url,
                    "review_date": review_date,
                    "review_date_unit": review_date_unit,
                    "review_date_desc": review_date_desc,
                    "review_body": review_body,
                }
                review_dict_list.append(review_dict)
    except Exception as e:
        # Was a Python-2 print statement (SyntaxError under Python 3).
        print("analyze review errors:{}".format(e))
        pass

    # follow_sell: parse the "other sellers" link. UK pages use
    # "<num> <type>"; US pages use "<type> (<num>) from $<price>".
    how_many_sellers = ""
    follow_type = ""
    follow_num = 0
    buy_money = 0.0

    try:
        olp_feature_div = soup.find(id="olp_feature_div")
        if olp_feature_div and olp_feature_div.find("a"):
            how_many_sellers = olp_feature_div.find("a").get_text().strip()
            if country:
                follow_sell = how_many_sellers.split()
                follow_num = follow_sell[0].strip()
                follow_type = follow_sell[1].strip()
            else:
                follow_sell = how_many_sellers.split('(')
                follow_type = follow_sell[0].strip()
                follow_num = follow_sell[1].split(')')
                buy_money = follow_num[1].split('$')
                buy_money = buy_money[1].strip().replace(',', '').rstrip(' +')
                follow_num = follow_num[0].strip()

    except Exception as e:
        print("Handling follow_sell errors !: {}".format(e))
        pass

    listing_info_dict = {
        "id": now,
        "asin": asin,
        "url": url,
        "brand": brand,
        "brand_url": brand_url,
        "title": title,
        "variation_name": variation_name,
        "availability": availability,
        "price": price,
        "sold_by": sold_by,
        "how_many_sellers": how_many_sellers,
        "follow_type": follow_type,
        "follow_num": follow_num,
        "buy_money": buy_money,
        "review_num": review_num,
        "review_value": review_value,
        "review_last_time": review_last_time,
        "review_last_unit": review_last_unit,
        "review_last_desc": review_last_desc,
        "spans_text": spans_text,
        "qa_num": qa_num,
        "aplus": aplus,
    }

    return listing_info_dict, ranking_list, offering_list, review_dict_list