def get_shop_info(self):
    """Fetch the shop page at ``self.link`` and print the seller name, the
    feedback summary table, and the product list.

    Best-effort scraper: assumes the page layout contains the elements
    ``sellerName``, ``feedback-summary-table`` and ``product-data``.
    """
    headers = GlobalTools.getHeaders()
    res = requests.get(self.link, headers=headers)
    html = BeautifulSoup(res.text, 'lxml')
    print(html.find(id="sellerName").text)

    feedback_table = html.find(id="feedback-summary-table")
    rows = feedback_table.find_all("tr")
    # Header row: the time-scope column labels (e.g. 30 days / 90 days / ...).
    for header_cell in rows[0].find_all("th"):
        print(header_cell.text)
    # Print every data row as one comma-separated line.
    # BUG FIX: the original loop variable shadowed the `feedback` table Tag.
    for row in rows:
        for cell in row.find_all("td"):
            print(cell.text.strip("\n").strip() + ",", end="")
        print()

    products = html.find(id="product-data").find_all(
        attrs={'class': "product-details"})
    for product in products:
        titlelink = product.find('a', attrs={'class': "product-title"})
        title = titlelink.get('title')
        href = titlelink.get('href')
        # BUG FIX: price/rating were bs4 Tag objects; concatenating a Tag to
        # str raised TypeError. Use their text instead.
        price = product.find('div', attrs={'class': 'product-price'}).text.strip()
        rating = product.find('div', class_="product-rating").text.strip()
        print(title + " " + href + " " + price + " " + rating + " ")
def get_following_by_asin(asin, baseurl):
    """Search *baseurl* for *asin* and check whether an offer-listing link
    reports more than one seller ("follower") for the product.

    Returns ``[True, href]`` for the first offer-listing link whose seller
    count is greater than 1, otherwise ``[False]``.
    """
    headers = GlobalTools.getHeaders()
    url = baseurl + "/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=" + str(asin)
    print("get url:" + url)
    res = requests.get(url, headers=headers)
    # Amazon may answer with Brotli-compressed content; decompress exactly
    # once and reuse the bytes (the original decompressed twice).
    if res.headers.get('Content-Encoding') == "br":
        content = brotli.decompress(res.content)
    else:
        content = res.content
    html = BeautifulSoup(content, "lxml")
    # Debug dump of the raw page. BUG FIX: bytes must be written in binary
    # mode; the original opened the file with "w+".
    with open("tmp2.html", "wb") as tmp:
        tmp.write(content)
    links = (html.find(id="s-results-list-atf")).find_all('a')
    for link in links:
        if 'offer-listing' not in link['href']:
            # BUG FIX: the original returned [False] on the first
            # non-matching link, so later offer-listing links were
            # never inspected.
            continue
        # Link text looks like "(N offers ...)"; extract N.
        if int(link.text.strip().split('(')[1].split()[0]) > 1:
            return [True, link['href']]
        return [False]
    return [False]
def get_profile_info(self, url):
    """Fetch the public profile-widget JSON for an Amazon profile URL and
    print the reviewer rank.

    Example input url:
    https://www.amazon.co.uk/gp/profile/amzn1.account.AHTYNWDHL6M2WCVS7LOUVFXBLLFQ/ref=cm_cr_getr_d_gw_btm?ie=UTF8
    """
    # Extract the profile id between "profile/" and "/ref".
    profileid = url.split("profile/")[1].split("/ref")[0]
    url = "https://www.amazon.co.uk/profilewidget/bio/" + profileid + "?view=visitor"
    print("in get_profile_info")
    headers = GlobalTools.getHeaders()
    # Session cookie captured from a logged-in browser; the endpoint appears
    # to require it — NOTE(review): expires, replace when requests fail.
    headers[
        'Cookie'] = 's_nr=1507517187408-New; s_vnum=1939517187408%26vn%3D1; s_dslv=1507517187409; x-acbde="6gxYYwpBpG20FBChzzu9sn?hypH9MpwKF0gVmk2LOxnYWw2uE@5B3Qh7Df?gkrXM"; at-acbde=Atza|IwEBIFPo-tRvBxygSgF8Ard63lJANpi78TG-8BUTC8ScSLLiUskUDIh0VMUwG_l8fsWqij5ArfksGmp6Ks52ZiYPS0bJeoDkACAtCZF6h3ePo0yqw9jdKVsq4edrTZPfLFYYYaRsbNyD2x09klSn7jKaU8Sn56Cr4VCIx_H8LObqLF2bX6Aq0EWW-O0PoBHgkdYI9iPhMo_2OHQjWuFAeinw0dU1M7X-SWBl2wB4FtzVXlQzarbwLjsHxXSaw2LwX3ENF6oCHOh73pPPnTX68JEedEkLu-sOSL2eZ5Whe7zJ2L76yyEzyjVXQpWbDdUqUP58MdLTNLfhCM5LkwWGmd7fuoLC1u7sZhBkJSA6oLQ0Q3kua5e8x0LfI3HfLZwC6qzrDJ6pheW0my98MFK4r9JaG85Z; sess-at-acbde="d7DXrZglD8+7+42k5qmlfFUxSpHJkUg8H1Dz17ZCU+U="; x-wl-uid=1WLJUGaYF93xUQuJRK3PCgsu0IJeaJoL7J/7XRaD4Men7E4FPUEro4vxW+rjyvLb9XCGGKFNM1yrtwZ9b9BK3yXkMKCav41q6XBiaxBqGmVWG1vMYfNxoP30XR5Otq5GKr5uenX7TA98=; session-token="1o+pNqOm6F7uZWrYdtDbU26LiB8ByJ40B64c+JFwPh3lkBt1MbUn+ha6qR3BaTgduMMVK1e1LjJ6pnoF+/r3c4PUBDfax7J+AGcgt2QiXkvMdVyLjyDowIQtWUbeHi6V4hfxIhgrYGcAyZ4x4keQvPaEHOW0v8t8akQV0nmi5sj1Jzu8pn162bmTw0XLP88olTMWGCWAeJlHGsXpCvyiS1VrFGHpgj2xSW3j5jdNi8DCjE4R7E+EqR+4BNFVQs+1KUR7bf9qBMWu3xT7DDe9KQ=="; session-id-time=2082754801l; session-id=261-5557160-1959728; ubid-acbde=258-5984155-0914160; csm-hit=0CP5W9ZYZNFE06XFCV0V+b-9KE07PDF4YD27JB8DFQQ|1509967781417'
    res = requests.get(url, headers=headers)
    s = res.text
    try:
        jsonobj = json.loads(s)
    # BUG FIX: narrowed the bare `except:` to ValueError (the parent of
    # json.JSONDecodeError) so KeyboardInterrupt etc. are not swallowed.
    except ValueError:
        # Dump the unparseable response for debugging, then bail out.
        with open("tmp2.html", "w+") as tmp:
            tmp.write(s)
        exit(1)
    reviewRank = jsonobj['topReviewerInfo']['rank']
    print(reviewRank)
def prerequest(self):
    """Fetch the product page for ``self.url`` and detect whether it is a
    "normal" page (one with an add-to-cart button).

    If the button is missing but an "availability" section links to an
    offer-listing page, follow that link and scrape the price and seller
    name from it, storing them in ``self.unnormal_price`` /
    ``self.unnormal_shop``.

    Returns:
        bool: False only when the offer-listing fallback scrape failed
        (``self.normal_situation`` is also set False); True otherwise.
    """
    queue = self.queue
    queue.put("prerequest")
    print("prerequest")
    GlobalTools.setbaseurl(self.baseurl)
    res = requests.get(self.url, headers=self.headers)
    self.res = res
    html = GlobalTools.getResponseContent(self.res)
    if html.find(id="add-to-cart-button") is None:
        # Page has no buy button — try the availability link, which points
        # at the offer-listing page for this product.
        if html.find(id="availability") is not None:
            # print "text" + html.find(id="availability").text
            url = self.baseurl + html.find(
                id="availability").find("a").get('href')
            self.second_url = url
            res = requests.get(url, headers=GlobalTools.getHeaders())
            html = GlobalTools.getResponseContent(res)
            try:
                price = html.find(class_="olpOfferPrice").text.strip()
                self.unnormal_price = price
                print(price)
                shop = html.find(class_="olpSellerName").text
                self.unnormal_shop = shop
                print(shop)
            except:
                # Offer-listing layout didn't match — record the failure
                # and tell the caller this product can't be scraped normally.
                traceback.print_exc()
                self.normal_situation = False
                return False
    return True
def get_product_images(self):
    """Download every image URL in ``self.imgurls`` into the working
    directory, naming each file ``<asin>_<basename-of-image-url>``."""
    for img in self.imgurls:
        res = requests.get(img, headers=GlobalTools.getHeaders())
        # BUG FIX: the filename was derived from self.url, so every image in
        # the loop overwrote the same file; derive it from the image URL.
        filename = self.asin + "_" + img.split("/")[-1]
        with open(filename, "wb") as f:
            f.write(res.content)
def get_email(self, url):
    """Fetch *url* (a profile AJAX endpoint) with browser-like headers and
    print the raw response, for inspecting contact/email data."""
    print("******************")
    print("url:" + url)
    headers = GlobalTools.getHeaders()
    headers['X-Requested-With'] = 'XMLHttpRequest'
    headers[
        'Referer'] = 'https://www.amazon.de/gp/profile/amzn1.account.AF3BW3DYKKEHMR4HSAFIQDM62QNQ/ref=cm_cr_getr_d_pdp?ie=UTF8'
    # Session cookie captured from a logged-in browser —
    # NOTE(review): expires, replace when requests start failing.
    headers[
        'Cookie'] = 's_nr=1507517187408-New; s_vnum=1939517187408%26vn%3D1; s_dslv=1507517187409; x-acbde="6gxYYwpBpG20FBChzzu9sn?hypH9MpwKF0gVmk2LOxnYWw2uE@5B3Qh7Df?gkrXM"; at-acbde=Atza|IwEBIFPo-tRvBxygSgF8Ard63lJANpi78TG-8BUTC8ScSLLiUskUDIh0VMUwG_l8fsWqij5ArfksGmp6Ks52ZiYPS0bJeoDkACAtCZF6h3ePo0yqw9jdKVsq4edrTZPfLFYYYaRsbNyD2x09klSn7jKaU8Sn56Cr4VCIx_H8LObqLF2bX6Aq0EWW-O0PoBHgkdYI9iPhMo_2OHQjWuFAeinw0dU1M7X-SWBl2wB4FtzVXlQzarbwLjsHxXSaw2LwX3ENF6oCHOh73pPPnTX68JEedEkLu-sOSL2eZ5Whe7zJ2L76yyEzyjVXQpWbDdUqUP58MdLTNLfhCM5LkwWGmd7fuoLC1u7sZhBkJSA6oLQ0Q3kua5e8x0LfI3HfLZwC6qzrDJ6pheW0my98MFK4r9JaG85Z; sess-at-acbde="d7DXrZglD8+7+42k5qmlfFUxSpHJkUg8H1Dz17ZCU+U="; x-wl-uid=1WLJUGaYF93xUQuJRK3PCgsu0IJeaJoL7J/7XRaD4Men7E4FPUEro4vxW+rjyvLb9XCGGKFNM1yrtwZ9b9BK3yXkMKCav41q6XBiaxBqGmVWG1vMYfNxoP30XR5Otq5GKr5uenX7TA98=; session-token="1o+pNqOm6F7uZWrYdtDbU26LiB8ByJ40B64c+JFwPh3lkBt1MbUn+ha6qR3BaTgduMMVK1e1LjJ6pnoF+/r3c4PUBDfax7J+AGcgt2QiXkvMdVyLjyDowIQtWUbeHi6V4hfxIhgrYGcAyZ4x4keQvPaEHOW0v8t8akQV0nmi5sj1Jzu8pn162bmTw0XLP88olTMWGCWAeJlHGsXpCvyiS1VrFGHpgj2xSW3j5jdNi8DCjE4R7E+EqR+4BNFVQs+1KUR7bf9qBMWu3xT7DDe9KQ=="; session-id-time=2082754801l; session-id=261-5557160-1959728; ubid-acbde=258-5984155-0914160; csm-hit=0CP5W9ZYZNFE06XFCV0V+b-9KE07PDF4YD27JB8DFQQ|1509967781417'
    # BUG FIX: headers was passed positionally, which requests.get treats as
    # the `params` argument — the request went out with no custom headers
    # and the cookie/referer above were never sent.
    res = requests.get(url, headers=headers)
    print(res.status_code)
    html = GlobalTools.getResponseContent(res)
    print(html)
    print("******************")
def __init__(self, asin, positive_count, negtive_count, debug=False):
    """Set up a review-vote worker for one product.

    Args:
        asin: product ASIN to operate on.
        positive_count: total number of positive reviews.
        negtive_count: total number of critical reviews.
        debug: keyword flag enabling extra diagnostics.
    """
    self.debug = debug
    self.baseurl = GlobalTools.getbaseurl()
    self.asin = asin
    self.headers = GlobalTools.getHeaders()
    # AJAX endpoint used to pull review pages (left/positive viewpoint).
    self.positiveurl = self.baseurl + "/hz/reviews-render/ajax/reviews/get/ref=cm_cr_arp_d_viewpnt_lft"
    # BUG FIX: "/" produces a float under Python 3; page counts must be
    # integers, so use floor division (4 reviews per page).
    self.negtive_page_count = negtive_count // 4 + 1
    self.positivecount = positive_count
    self.negtivecount = negtive_count
    self.positivevote = 0
    self.negtivevote = 0
    # Vote-type tags used elsewhere to distinguish up/down votes.
    self.POSITIVE_VOTE_TYPE = 0
    self.NEGTIVE_VOTE_TYPE = 1
def __init__(self, queue, asin, countrycode):
    """Set up a product scraper for one ASIN on one country's marketplace.

    Args:
        queue: shared queue used to report progress to the caller.
        asin: product ASIN to scrape.
        countrycode: marketplace country code (e.g. "uk", "de").
    """
    self.queue = queue
    self.countrycode = countrycode
    self.baseurl = GlobalTools.getBaseurlFromCountrycode(countrycode)
    self.headers = GlobalTools.getHeaders()
    self.asin = asin
    # Resolve the product detail-page link via a search for the ASIN.
    self.url = get_link_by_asin(asin, self.baseurl)
    # Fallback offer-listing URL, used when the normal page has no price or
    # shop name. It looks like:
    # http://www.amazon.de/gp/offer-listing/B01N52QW8A/ref=dp_olp_0?ie=UTF8&condition=all
    self.second_url = ""
    self.normal_situation = True
    self.unnormal_price = ""
    self.unnormal_shop = ""
    self.resultmap = {}
    self.result = []
    self.us_reviews_need_adjust = False
def get_link_by_asin(asin, baseurl):
    """Search *baseurl* for *asin* and return the first product detail-page
    link (with the ``&qid`` tracking suffix stripped)."""
    headers = GlobalTools.getHeaders()
    url = baseurl + "/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=" + str(
        asin)
    print("get url:" + url)
    res = requests.get(url, headers=headers)
    print("res:headers:")
    print(res.headers)
    # BUG FIX: use .get() — Content-Encoding may be absent, and the original
    # indexing raised KeyError. Decompress Brotli once and reuse the bytes
    # (the original decompressed twice).
    if res.headers.get('Content-Encoding') == "br":
        content = brotli.decompress(res.content)
    else:
        content = res.content
    html = BeautifulSoup(content, "lxml")
    # Debug dump of the raw page. BUG FIX: bytes need a binary-mode file.
    with open("tmp2.html", "wb") as tmp:
        tmp.write(content)
    link = (html.find(id="s-results-list-atf")).find(
        'a', attrs={'class': 's-access-detail-page'})
    link = link.get('href')
    link = link.split("&qid")[0]
    print("link:" + link)
    return link
def getFlowingList(url):
    """Fetch an offer-listing page and print each seller's name and
    profile URL (one ``(name, url)`` pair per offer)."""
    res = requests.get(url, headers=GlobalTools.getHeaders())
    # Content-Encoding may be absent; .get() avoids a KeyError.
    if res.headers.get('Content-Encoding') == "br":
        html = BeautifulSoup(brotli.decompress(res.content), "lxml")
    else:
        html = BeautifulSoup(res.content, "lxml")
    # BUG FIX: the original called html.find(id, "olpOfferList"), passing the
    # *builtin* `id` function as the tag-name argument — the lookup could
    # never match. The id must be given as a keyword.
    followerlist = html.find(id="olpOfferList").find_all(class_="olpOffer")
    for follow in followerlist:
        followerNameElem = follow.find(class_="olpSellerName")
        if len(followerNameElem.find_all("a")) > 0:
            # Third-party seller: name is link text, URL is the link target.
            followerName = followerNameElem.text
            url = GlobalTools.getBaseurlFromCountrycode("uk") + (followerNameElem.find("a"))['href']
        elif len(followerNameElem.find_all("img")) > 0:
            # Amazon itself is rendered as a logo image, not a link.
            followerName = followerNameElem.find("img")['alt']
            url = "https://amazon.com"
        else:
            followerName = ""
            url = ""
        print(followerName, url)
def get_link_by_asin(asin, baseurl):
    """Search *baseurl* (new "/s?k=" search UI) for *asin* and return the
    matching detail-page link, or None when no result contains the ASIN."""
    headers = GlobalTools.getHeaders()
    url = baseurl + "/s?k=" + asin + "&ref=nb_sb_noss_2"
    print("get url:" + url)
    res = requests.get(url, headers=headers)
    print(res.headers)
    # BUG FIX: use .get() — Content-Encoding may be absent and direct
    # indexing raised KeyError. Decompress once and reuse.
    if res.headers.get('Content-Encoding') == "br":
        content = brotli.decompress(res.content)
    else:
        content = res.content
    html = BeautifulSoup(content, "lxml")
    # Debug dump of the decoded page.
    with open("searchasin.html", "w") as f:
        f.write(content.decode("utf-8"))
    t = html.find_all(class_="s-search-results")[1]
    for item in t.find_all("a"):
        # ROBUSTNESS: anchors without an href returned None, which crashed
        # the `in` containment test below.
        href = item.get('href') or ""
        if "/dp/" + asin in href:
            return baseurl + (href.split("&qid")[0])
    return None
def getusviewcount(self):
    """Scrape the amazon.com review page for this ASIN and record the
    positive and critical review counts in ``self.resultmap``.

    Best effort: parse failures for either viewpoint panel are swallowed
    and simply leave that entry unset.
    """
    asin = self.asin
    url = "https://www.amazon.com/product-reviews/" + asin + "/ref=acr_dpx_see_all?ie=UTF8&showViewpoints=1"
    response = requests.get(url, headers=GlobalTools.getHeaders())
    page = GlobalTools.getResponseContent(response)
    viewpoints = page.find_all(id=re.compile("viewpoint-"))
    if viewpoints:
        try:
            # Left panel, e.g. "See all N positive reviews".
            label = viewpoints[0].find_all(
                attrs={"data-reftag": "cm_cr_arp_d_viewpnt_lft"})[0].text
            self.resultmap['positivereviewcount'] = int(
                label.split("positive")[0].split("all")[1].strip())
        except:
            pass  # layout changed / count missing — leave entry unset
    if len(viewpoints) > 1:
        try:
            # Right panel, e.g. "See all N critical reviews".
            label = viewpoints[1].find_all(
                attrs={"data-reftag": "cm_cr_arp_d_viewpnt_rgt"})[0].text
            self.resultmap['negtivereviewcount'] = int(
                label.split("critical")[0].split("all")[1].strip())
        except:
            pass
    print(viewpoints)
def __init__(self, seller):
    """Hold the configuration needed to search one seller's products.

    Args:
        seller: seller identifier used by the product-search requests.
    """
    self.seller = seller
    # Marketplace / endpoint configuration comes from the shared tools.
    self.marketplaceid = GlobalTools.getMarketplaceID()
    self.url = GlobalTools.getSearchShopProductsUrl()
    self.headers = GlobalTools.getHeaders()
def newfba(asin):
    """Post a cart quantity-update AJAX request (replaying a captured
    browser session) and print the returned cart fragments.

    NOTE(review): all tokens, cookies and item ids below were captured from
    a live session and will expire — confirm/refresh before relying on this.
    """
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.amazon.co.uk",
        "Origin": "https://www.amazon.co.uk",
        "Content-Encoding": "br",
        "Pragma": "no-cache",
        "Referer": "https://www.amazon.co.uk/gp/cart/view.html/ref=lh_cart_vc_btn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "X-AUI-View": "Desktop",
        "X-Requested-With": "XMLHttpRequest",
        "Cookie": """s_nr=1507528377078-New; s_vnum=1939528377079%26vn%3D1; s_dslv=1507528377080; x-wl-uid=1Vm2WQeeHRHErocdZmw5/GK41LYoPF67ySPkncJEPAiRRhWfNF0OyPa9yuT4S7+FNdyHQwhizugO0QMrffNe4I2JzXtJIy14CzCSmvUSme8lqhoZjjh77OF8sXJ/jQGXBcjMLuoUEESU=; at-acbuk=Atza|IwEBIOoVGprM0g2Qazrt-ifX53XNbsi7XFYs1OZNmIDgeZD6a5i2s7p4JuLWL6fC30oebF1OGUvU7z7HI266F0nMzVdpN8mWBQ1uOoa0XcmqZYdODKvv57Rq3jARRIaOoqkDAS6Ke6QFIjp1s1V6ZnPftLOOaz9uKLjRlvbMvtD57XnNZq2blSLo8IqJh0BhgpIH1K7cfEd7zgHGInlid0GyjKhMTaN5oRoZEzbvHAl9aHx15bRG8rKSbqpHQMeylRnYRnOirQGFgyPs2zQUp6YtUbivSlb8LGmOXL8aQaqZSE2lwyI3Sy9cGtDbBucHLB-OK4t89Rf5NIMRMSM-uMddzWr504Cg7_bOJ6RZFABsEDvdDEIItPRgnhrDksbMefih0AQSF8jnS9xXg3UbX9tqRbjA; amznacsleftnav-328511b4-0414-31f1-91c6-bca31c30900c=1; x-acbuk="DpcKYrm9@Uw75TNjTsXwmX79eaa3NMP2dk5ZlsVntw6MXujQjHcGEerpfDRFK8hR";session-token=9SQ2EeLcEOiWNXk9Km/DNS6S1V0UZwProvVruiPJrCVgmxhyesgqA/fp58r9T9x2sKqlQqrsEEER26oL2mWsLSDfPDsZIgbKwHiWox5/i0IB0R8heds6DI1HK15chFLvoLUg/J8JaqgwtAoINSoQpvXPRngz83hB73b9x54TmuIuxH8LyuVsQlHkt5CeOaWAKHpif0qNYASaMLmf/Q0EDRW8RO0yBFk+SPYTIZwRv8wy4200Mchhe4UhrsdJOX4aubGsciZgiUtFN7fjp4F4NQ=="; lc-acbuk=en_GB; ubid-acbuk=261-6573040-2508135; session-id-time=2082758401l; session-id=259-7896668-2728509; 
csm-hit=DQ3DSN2G6C2P8DBSE4K4+s-4CDTDE03S82FARC6XGS1|1514455697049"""
    }
    url = "https://www.amazon.co.uk/gp/cart/ajax-update.html/ref=ox_sc_update_quantity_1%7C9%7C11"
    data = {
        "hasMoreItems": 0,
        "timeStamp": 1514454024,
        "token": "gFHNsVRD27zMiOpe+yYpwFsAOZohN8u+a5VmqKkAAAAJAAAAAFpEvAhyYXcAAAAA",
        "activeItems": "C31HAVQP205TNO|1|0|5|3.05|||0||",
        "addressId": "",
        "addressZip": "",
        "closeAddonUpsell": 1,
        "flcExpanded": 0,
        "quantity.C31HAVQP205TNO": "11",
        "pageAction": "update-quantity",
        "submit.update-quantity.C31HAVQP205TNO": "1",
        "actionItemID": "C31HAVQP205TNO",
        "requestID": "EFHWWNTW6V3PRPMTQVWY",
        "asin": "B003KN7PU2",
        "encodedOffering": "%2BMwdK243Pp3oHjtzeyP6rdX8pnsybQAfRMa%2FX803XTXSTS7T%2BThAv741wG3TqvzM2kBUhnHpgojcF03P1%2FiSGuiN%2F5D6331v80WV2YLu2HU%3D"
    }
    # Percent-encode the JSON-serialized form data (replays the captured
    # request shape) — NOTE(review): unusual for an urlencoded POST; confirm
    # the endpoint really expects this encoding.
    comm_params = urllib.parse.quote(json.dumps(data))
    request = requests.session()
    # Warm-up GET to collect session cookies before posting the cart update.
    request.get("https://www.amazon.co.uk", headers=GlobalTools.getHeaders())
    # NOTE(review): the POST uses the module-level `requests`, not the
    # session above — confirm whether the session cookies are needed.
    res = requests.post(url, headers=headers, data=comm_params)
    print(res.content)
    # BUG FIX: json.loads() no longer accepts an `encoding` argument (it was
    # ignored since 3.1 and removed in Python 3.9); bytes input is decoded
    # automatically.
    jsonobj = json.loads(res.content)
    print(jsonobj['features']['imb'])
    print(jsonobj['features']['nav-cart'])