# Shared imports assumed by this section. The tasks and model methods below
# come from different modules of the project, so project-level helpers such
# as country_url(), pro_chi(), get_url(), send_email(), the shared `headers`
# dict, and the Django models (Product, Product_seller, Seller_change,
# Product_to_user, QA_detail, Keyword_rank, Rank, ...) are expected to be
# in scope.
import datetime
import random
import re
import time

import pytz
import requests
from django.contrib.auth import get_user_model
from django.utils import timezone
from lxml.html import fromstring
from scrapy.selector import Selector


def review_monitor(self, asin, country='us'):
    # Celery task (bind=True): scrape the ten newest reviews of an ASIN.
    # Selector(text=...) requires str, so page.text is used, not page.content.
    try:
        proxy = pro_chi()
        url_start = country_url(country)
        headers['Host'] = country_url(country).split("/")[2]
        page = requests.get(
            url_start + 'product-reviews/' + asin +
            '/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1',
            headers=headers, proxies=proxy)
        review_monitor = {}
        for i in range(10):
            star = Selector(text=page.text).xpath(
                ".//*[@class='a-section review']/div/div[1]/a[1]/@title"
            ).extract()[i][0:3]
            customer_id = (Selector(text=page.text).xpath(
                ".//*[@class='a-section review']/div/div[2]/span/a/@href"
            ).extract()[i]).split("/")[3]
            review_time = Selector(text=page.text).xpath(
                ".//*[@class='a-section review']/div/div[2]/span[4]/text()"
            ).extract()[i][3:]
            review_monitor[i] = {
                'star': star,
                'customer_id': customer_id,
                'review_time': review_time
            }
        print(review_monitor)
    except Exception as e:
        dt = datetime.datetime.now(pytz.utc) + datetime.timedelta(seconds=30)
        self.retry(eta=dt, exc=e, max_retries=3)

def review_monitor(asin, country='us'):
    # Standalone debug variant of the task above: no proxy, no retry, and it
    # prints the raw page. The original rebound `page` to the decoded body and
    # then read page.content, which raised AttributeError; parsing now uses a
    # separate `html` string, and the function uses its own parameters instead
    # of a hard-coded country and ASIN.
    url_start = country_url(country)
    headers['Host'] = country_url(country).split("/")[2]
    url = (url_start + 'product-reviews/' + asin +
           '/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1')
    print(url)
    page = requests.get(url, headers=headers)
    html = page.text
    print(html)
    review_monitor = {}
    tree = fromstring(html)
    print(tree.findtext('.//title'))
    print(Selector(text=html).xpath(".//*[@class='a-section review']"))
    for i in range(10):
        star = Selector(text=html).xpath(
            ".//*[@class='a-section review']/div/div[1]/a[1]/@title"
        ).extract()[i][0:3]
        customer_id = (Selector(text=html).xpath(
            ".//*[@class='a-section review']/div/div[2]/span/a/@href"
        ).extract()[i]).split("/")[3]
        review_time = Selector(text=html).xpath(
            ".//*[@class='a-section review']/div/div[2]/span[4]/text()"
        ).extract()[i][3:]
        review_monitor[i] = {
            'star': star,
            'customer_id': customer_id,
            'review_time': review_time
        }
    print(review_monitor)

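# A minimal sketch (not part of the original module) of per-review parsing:
# iterating over each review node once avoids the parallel, index-matched
# lookups above, so one review with a missing field cannot shift every later
# field. The Selector import and the XPaths are the same ones the functions
# above already assume; the helper name is an illustration, not project code.
def parse_reviews(html):
    reviews = []
    for box in Selector(text=html).xpath(".//*[@class='a-section review']"):
        title = box.xpath("./div/div[1]/a[1]/@title").extract()
        href = box.xpath("./div/div[2]/span/a/@href").extract()
        when = box.xpath("./div/div[2]/span[4]/text()").extract()
        reviews.append({
            'star': title[0][0:3] if title else None,           # e.g. "4.0"
            'customer_id': href[0].split("/")[3] if href else None,
            'review_time': when[0][3:] if when else None,       # drop "on "
        })
    return reviews
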
def asin_title(asin, country):
    headers['Host'] = country_url(country).split("/")[2]
    url_start = country_url(country)
    page = s.get(url_start + 'gp/offer-listing/' + asin +
                 '/ref=olp_page_5?ie=UTF8&startIndex=500',
                 headers=headers)
    return page

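# asin_title() relies on a module-level requests session named `s` that is not
# shown in this section. A minimal sketch of the assumed setup; reusing the
# shared scraping `headers` here is an assumption, not confirmed by the source:
s = requests.Session()
s.headers.update(headers)  # assumption: share the module's scraping headers
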
def save(self, *args, **kwargs):
    if not self.customer_url:
        self.customer_url = "%sgp/pdp/profile/%s/ref=cm_cr_arp_d_pdp?ie=UTF8" % (
            country_url(self.product.country), self.customer_id)
    try:
        # Normalize e.g. "May 1, 2018" to "2018-05-01". '%B' only matches
        # English month names, so non-US dates are kept as-is.
        if self.review_time:
            self.review_time = time.strftime(
                '%Y-%m-%d', time.strptime(self.review_time, '%B %d, %Y'))
    except (ValueError, TypeError):
        pass
    super(Review_detail, self).save(*args, **kwargs)

def save(self, *args, **kwargs):
    if not self.customer_url:
        self.customer_url = "%sgp/pdp/profile/%s/ref=cm_cr_arp_d_pdp?ie=UTF8" % (
            country_url(self.product.country), self.customer_id)
    super(Review, self).save(*args, **kwargs)

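# A minimal sketch of the date normalization used by the save() methods above,
# pulled out so it can be unit-tested; the fall-back-to-raw behaviour mirrors
# the silent `pass` above. The helper name and the idea of passing per-country
# format strings are assumptions, not part of the original models.
def normalize_review_date(raw, fmt='%B %d, %Y'):
    try:
        return datetime.datetime.strptime(raw, fmt).strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        return raw  # keep the original string if it does not parse
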
def qa_collect(self, asin, country='us'):
    # Celery task (bind=True): collect the Q&A entries for an ASIN and store
    # them as QA_detail rows. The original duplicated the whole box-parsing
    # loop for the paginated and single-page cases; it is factored into one
    # local helper here, and answer_quan is now reset per box so a value can
    # no longer leak from the previous question.
    try:
        product = Product.objects.filter(asin=asin, country=country)[0]
        page = get_url('ask/questions/asin/' + asin +
                       '/ref=ask_ql_psf_ql_hza?sort=SUBMIT_DATE', country)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        if tree.findtext('.//title') in ('Robot Check', 'Amazon CAPTCHA'):
            return {'to': 0}
        if Selector(text=page.text).xpath('.//*[@id="noResultsTitle"]'):
            return {'to': 0}

        next_page_path = "//ul[@class='a-pagination']/li[@class='a-last']//a/@href"
        boxes_path = (".//*[@class='a-section askTeaserQuestions']"
                      "/div[@class='a-fixed-left-grid a-spacing-base']")
        right_col = (".//div[@class='a-fixed-left-grid a-spacing-base']"
                     "//div[@class='a-fixed-left-grid-col a-col-right']")
        answer_link = (".//div[@class='a-fixed-left-grid a-spacing-base']"
                       "//div[@class='a-section a-spacing-none a-spacing-top-mini']/a")
        qa_collection = {}

        def collect_boxes(page):
            # Parse every question box on the current page into qa_collection.
            for box in Selector(text=page.text).xpath(boxes_path):
                answer_url, answer, answer_user, qa_time, answer_quan = (
                    None, None, None, None, None)
                vote = int(box.xpath(
                    ".//ul[@class='vote voteAjax']/li[2]/span[1]/text()"
                ).extract()[0])
                question = box.xpath(
                    ".//div[@class='a-fixed-left-grid a-spacing-small']"
                    "//a[@class='a-link-normal']/text()").extract()[0]
                try:
                    qa_time = box.xpath(right_col + "/span[2]/text()").extract()[-1]
                except IndexError:
                    pass
                try:
                    short = box.xpath(right_col + "/span[1]/text()").extract()
                    if short:
                        answer = short[0]
                        # jp pages sometimes ship an empty short answer; fall
                        # back to the expanded text.
                        if country == 'jp' and answer == "":
                            answer = " ".join(box.xpath(
                                ".//span[@class='askLongText']/text()"
                            ).extract()).strip()
                    else:
                        answer = " ".join(box.xpath(
                            ".//span[@class='askLongText']/text()"
                        ).extract()).strip()
                except IndexError:
                    pass
                try:
                    answer_user = box.xpath(right_col + "/span[2]/text()").extract()[0]
                except IndexError:
                    pass
                try:
                    answer_quan = box.xpath(answer_link + "/text()").extract()[0]
                    answer_quan = re.search(r'\d+', answer_quan).group(0)
                except (IndexError, AttributeError):
                    pass
                try:
                    answer_url = box.xpath(answer_link + "/@href").extract()[0]
                    answer_url = country_url(country)[:-1] + answer_url
                except IndexError:
                    pass
                if answer_user is None:
                    continue
                if answer_user == qa_time:
                    # Name and date came from the same "By NAME on DATE" node;
                    # split them with the marketplace's separators.
                    if country in ['us', 'uk', 'ca', 'de']:
                        name_date = re.split(' on |By |Von | am ', answer_user)
                    elif country == 'it':
                        name_date = re.split(' in |Da ', answer_user)
                    elif country == 'fr':
                        name_date = re.split(' le |Par ', answer_user)
                    elif country == 'es':
                        name_date = re.split(' el |Por ', answer_user)
                    elif country == 'jp':
                        name_date = re.split('投稿者: |、投稿日: ', answer_user)
                    answer_user = name_date[1]
                    qa_time = name_date[2]
                else:
                    answer_user = re.split(' on |By |Von | am ', answer_user)[-1]
                    qa_time = re.split(' on |By |Von | am ', qa_time)[-1]
                if answer_url and answer_quan:
                    qa_collection[question] = {
                        'vote': vote, 'question': question,
                        'qa_time': qa_time.strip(), 'answer': answer,
                        'answer_user': answer_user.strip(),
                        'answer_quan': answer_quan, 'answer_url': answer_url}
                elif answer:
                    qa_collection[question] = {
                        'vote': vote, 'question': question,
                        'qa_time': qa_time.strip(), 'answer': answer,
                        'answer_user': answer_user.strip()}

        collect_boxes(page)
        print(len(qa_collection))
        # Follow the "Next" pagination link for at most 200 pages.
        page_num = 0
        while (Selector(text=page.text).xpath(next_page_path) and page_num < 200):
            time.sleep(2 + random.random() * 5)
            page = get_url(
                Selector(text=page.text).xpath(next_page_path).extract()[0],
                country=country)
            page_num += 1
            collect_boxes(page)
            print(len(qa_collection))

        for qa in qa_collection:
            num = qa_collection[qa].get('answer_quan', "1")
            try:
                QA_detail.objects.get_or_create(
                    product=product, vote=qa_collection[qa]['vote'],
                    question=qa_collection[qa]['question'],
                    qa_time=qa_collection[qa]['qa_time'],
                    answer=qa_collection[qa]['answer'],
                    answer_person=qa_collection[qa]['answer_user'],
                    num=num, answer_url=qa_collection[qa]['answer_url'])
            except KeyError:
                # No answer_url was scraped for this question.
                QA_detail.objects.get_or_create(
                    product=product, vote=qa_collection[qa]['vote'],
                    question=qa_collection[qa]['question'],
                    qa_time=qa_collection[qa]['qa_time'],
                    answer=qa_collection[qa]['answer'],
                    answer_person=qa_collection[qa]['answer_user'],
                    num=num)
        #report = GlucoseCsvReport(product)
        #report.email(product.user, 'subject', 'message')
    except Exception as e:
        dt = datetime.datetime.now(pytz.utc) + datetime.timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)

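# The per-country "By NAME on DATE" splitting appears twice inside qa_collect.
# A minimal sketch of a table-driven helper for it; the separator patterns are
# copied from the task above, while the helper name, the dict, and the
# fallback return are assumptions for illustration.
BYLINE_SEPARATORS = {
    'us': ' on |By |Von | am ', 'uk': ' on |By |Von | am ',
    'ca': ' on |By |Von | am ', 'de': ' on |By |Von | am ',
    'it': ' in |Da ', 'fr': ' le |Par ', 'es': ' el |Por ',
    'jp': '投稿者: |、投稿日: ',
}

def split_byline(text, country):
    # re.split leaves an empty leading element when the string starts with a
    # separator, e.g. "By Alice on May 1, 2018" -> ['', 'Alice', 'May 1, 2018'].
    parts = re.split(BYLINE_SEPARATORS[country], text)
    if len(parts) >= 3:
        return parts[1].strip(), parts[2].strip()
    return text.strip(), None
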
def title_sellers(page, product, country, initial):
    # Walk the offer-listing pages, collect every seller, and diff them
    # against the stored Product_seller rows (mark-and-sweep via the `flag`
    # field). Newly appeared sellers trigger a notification e-mail.
    sell_items = {}
    counter = 12
    not_first = False
    try:
        if Product_seller.objects.filter(product=product):
            not_first = True
    except Exception:
        not_first = False
    while counter > 0:
        counter -= 1
        for i in range(10):
            try:
                # Offer rows sit at div[4] or div[3] depending on the layout.
                if Selector(text=page.text).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[4]/h3/span//a/@href"):
                    href = Selector(text=page.text).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[4]/h3/span//a/@href").extract()[0]
                    name = fromstring(page.content).findtext(
                        './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                        ']/div[4]/h3/span/a')
                else:
                    href = Selector(text=page.text).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[3]/h3/span//a/@href").extract()[0]
                    name = fromstring(page.content).findtext(
                        './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                        ']/div[3]/h3/span/a')
                seller = href.split("=")[-1]
                sell_url = country_url(country)[:-1] + href
                price = fromstring(page.content).findtext(
                    './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                    ']/div[1]/span[1]').strip()
                sell_items[seller] = {
                    'seller': seller,
                    'name': name,
                    'price': price,
                    'sell_url': sell_url
                }
                print(sell_items)
            except Exception:
                print('XPath logic failed to locate the offer row')
        #product_seller.mark_time=datetime.datetime.now()
        # Follow the pagination link; stop when there is no real "next" page.
        pagination = Selector(text=page.text).xpath(
            "//ul[@class='a-pagination']//a/@href").extract()
        if pagination and pagination[0] != '#':
            page = get_url(pagination[0], country=country)
        else:
            break
    if not sell_items:
        return 'No competing sellers found'
    if initial:
        for seller_id in sell_items:
            product_seller = Product_seller(
                product=product,
                name=sell_items[seller_id]['name'],
                seller_id=seller_id,
                price=sell_items[seller_id]['price'],
                sell_url=sell_items[seller_id]['sell_url'])
            product_seller.save()
    else:
        changed = False
        for seller_id in sell_items:
            if not Product_seller.objects.filter(product=product,
                                                 seller_id=seller_id):
                product_seller = Product_seller(
                    product=product,
                    name=sell_items[seller_id]['name'],
                    sell_url=sell_items[seller_id]['sell_url'],
                    seller_id=seller_id,
                    price=sell_items[seller_id]['price'])
                product_seller.save()
                if not_first and not Seller_change.objects.filter(
                        product=product, status='old',
                        created__gte=timezone.now() - datetime.timedelta(days=1)):
                    seller_change = Seller_change(
                        product=product,
                        status='new',
                        name=sell_items[seller_id]['name'],
                        sell_url=sell_items[seller_id]['sell_url'],
                        seller_id=seller_id,
                        price=sell_items[seller_id]['price'],
                        created=datetime.datetime.now())
                    seller_change.save()
                    changed = True
        # Mark: every seller still present on the listing keeps its row.
        for seller_id in sell_items:
            if Product_seller.objects.filter(product=product, seller_id=seller_id):
                Product_seller.objects.filter(
                    product=product, seller_id=seller_id).update(flag=True)
        if changed and not_first:
            product_to_user = Product_to_user.objects.filter(product=product)
            users = product_to_user.values_list('user', flat=True)
            User = get_user_model()
            users = User.objects.filter(id__in=users)
            seller_change = Seller_change.objects.filter(
                product=product,
                created__gte=timezone.now() - datetime.timedelta(minutes=3))
            sellers = seller_change.values_list('name', flat=True)
            for user in users:
                message = "\n".join([
                    u'Hello {0},'.format(user.username),
                    u'New competing sellers appeared on {0}:'.format(product.title),
                    u'Sellers:',
                    ','.join([seller for seller in sellers]),
                    u'Details:',
                    '/'.join(['amz668.com/follow_sale', product.slug]),
                    u'Amazon link: {0}'.format(page.url)
                ])
                send_email(user.email, message, 'New competing sellers detected')
        # Sweep: anything still unflagged has left the listing.
        for product_seller in Product_seller.objects.filter(product=product,
                                                            flag=False):
            seller_change = Seller_change(
                product=product,
                status='old',
                name=product_seller.name,
                seller_id=product_seller.seller_id,
                price=product_seller.price,
                created=datetime.datetime.now())
            seller_change.save()
        Product_seller.objects.filter(product=product, flag=False).delete()
        Product_seller.objects.filter(product=product).update(flag=False)

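# A minimal usage sketch for the pair of helpers above, under the same
# assumptions as the rest of this section (Product is the project model used
# by qa_collect; the wrapper name and the ASIN are hypothetical):
def check_sellers_once(asin, country='us'):
    product = Product.objects.get(asin=asin, country=country)
    page = asin_title(asin, country)   # fetch the offer-listing page
    return title_sellers(page, product, country, initial=False)
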
def rank_in_web(self, asin, keyword, country='us'):
    # Celery task (bind=True): find where `asin` ranks in the search results
    # for `keyword` and store a Rank row. Returns {'rank', 'page', 'sponsored'}.
    try:
        info = {'rank': None, 'page': None, 'sponsored': False}
        keyword_rank = Keyword_rank.objects.filter(
            asin=asin, country=country, word=keyword)[0]
        url_start = country_url(country)
        headers['Host'] = country_url(country).split("/")[2]
        s = requests.Session()
        page = s.get(url_start + 's?field-keywords=' + keyword,
                     headers=headers, proxies=pro_chi())
        keyword_rank.rank_url = page.url
        keyword_rank.save()
        if page.status_code != 200:
            raise Exception
        if Selector(text=page.text).xpath('.//*[@id="noResultsTitle"]'):
            raise Exception

        result_item = './/li[@class="s-result-item celwidget "]'
        card_item = ('.//li[@class="s-result-item s-result-card-for-container '
                     'a-declarative celwidget "]')
        result_count = './/*[@id="s-result-count"]/text()'

        # flag_1: the per-page item count could be parsed out of
        # "#s-result-count"; flag_2: the plain (non-card) result layout is used.
        flag_1, flag_2 = True, True
        if country == 'jp':
            try:
                # Fragile slice: pulls the per-page count out of the jp
                # result-count string.
                item_amount = int(Selector(text=page.text).xpath(
                    result_count).extract()[0][-4:-2])
            except (IndexError, ValueError):
                flag_1 = False
                item_amount = len(Selector(text=page.text).xpath(result_item))
        else:
            if len(Selector(text=page.text).xpath(result_item)):
                try:
                    # Fragile slice: e.g. "1-16 of ..." -> 16.
                    item_amount = int(Selector(text=page.text).xpath(
                        result_count).extract()[0][2:4])
                except (IndexError, ValueError):
                    flag_1 = False
                    item_amount = len(Selector(text=page.text).xpath(result_item))
            else:
                flag_2 = False
                try:
                    item_amount = int(Selector(text=page.text).xpath(
                        result_count).extract()[0][2:4])
                except (IndexError, ValueError):
                    flag_1 = False
                    item_amount = len(Selector(text=page.text).xpath(card_item))
        print(item_amount)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        if tree.findtext('.//title') in ('Robot Check', 'Amazon CAPTCHA'):
            # Blocked: repeat the latest stored rank if we have history,
            # otherwise store a zero rank; sponsored=True flags the bad fetch.
            if len(keyword_rank.rank.all()) > 2:
                rank = Rank(page=keyword_rank.rank.first().page,
                            number=keyword_rank.rank.first().number,
                            sponsored=keyword_rank.rank.first().sponsored,
                            keyword_rank=keyword_rank)
            else:
                rank = Rank(keyword_rank=keyword_rank, page=0, number=0,
                            sponsored=False)
            rank.save()
            info = {'rank': 0, 'page': 0, 'sponsored': True}
            return info

        seller = None
        if flag_1:
            page_num = 0
            while True:
                print(tree.findtext('.//title'), "page", page_num + 1)
                if country == 'jp' or flag_2:
                    fanwei = range(len(Selector(text=page.text).xpath(result_item)))
                else:
                    fanwei = range(len(Selector(text=page.text).xpath(card_item)))
                for j in fanwei:
                    result_id = str(item_amount * page_num + j)
                    if Selector(text=page.text).xpath(
                            ".//*[@id='result_" + result_id + "']/@data-asin"):
                        seller = Selector(text=page.text).xpath(
                            ".//*[@id='result_" + result_id +
                            "']/@data-asin").extract()[0]
                    else:
                        print('XPath logic failed to locate the result item')
                    print(seller)
                    if seller == asin:
                        try:
                            sponsored_text = Selector(text=page.text).xpath(
                                ".//*[@id='result_" + result_id +
                                "']/div/div/div/div[2]/h5/text()").extract()[0]
                            info = {'rank': j + 1, 'page': page_num + 1,
                                    'sponsored':
                                        sponsored_text.strip().lower() == 'sponsored'}
                        except IndexError:
                            info = {'rank': j + 1, 'page': page_num + 1,
                                    'sponsored': False}
                        print('Keyword %s: %s ranks on page %s, position %s'
                              % (keyword, asin, page_num + 1, j + 1))
                        rank = Rank(keyword_rank=keyword_rank, page=info['page'],
                                    number=info['rank'],
                                    sponsored=info['sponsored'])
                        rank.save()
                        return info
                if Selector(text=page.text).xpath(
                        ".//*[@id='pagnNextLink']/@href") and page_num < 20:
                    time.sleep(2 + random.random() * 5)
                    while True:
                        try:
                            page = s.get(
                                url_start + Selector(text=page.text).xpath(
                                    ".//*[@id='pagnNextLink']/@href"
                                ).extract()[0],
                                headers=headers, proxies=pro_chi())
                            break
                        except requests.exceptions.RequestException:
                            print("Connection refused by the server..")
                            print("Let me sleep for 5 seconds")
                            print("ZZzzzz...")
                            time.sleep(2 + random.random() * 5)
                            print("Was a nice sleep, now let me continue...")
                            continue
                    page_num += 1
                else:
                    # Not found within 20 pages: repeat the latest stored rank
                    # if we have history, otherwise store a zero rank.
                    if len(keyword_rank.rank.all()) > 2:
                        rank = Rank(page=keyword_rank.rank.first().page,
                                    number=keyword_rank.rank.first().number,
                                    sponsored=keyword_rank.rank.first().sponsored,
                                    keyword_rank=keyword_rank)
                    else:
                        rank = Rank(keyword_rank=keyword_rank, page=0, number=0,
                                    sponsored=False)
                    rank.save()
                    info = {'rank': 0, 'page': 0, 'sponsored': False}
                    return info
        else:
            # Result count could not be parsed: only scan the first page.
            if country == 'jp' or flag_2:
                fanwei = range(len(Selector(text=page.text).xpath(result_item)))
            else:
                fanwei = range(len(Selector(text=page.text).xpath(card_item)))
            for j in fanwei:
                try:
                    if Selector(text=page.text).xpath(
                            ".//*[@id='result_" + str(j) + "']/@data-asin"):
                        seller = Selector(text=page.text).xpath(
                            ".//*[@id='result_" + str(j) +
                            "']/@data-asin").extract()[0]
                    else:
                        print('XPath logic failed to locate the result item')
                    print(seller)
                    if seller == asin:
                        try:
                            sponsored_text = Selector(text=page.text).xpath(
                                ".//*[@id='result_" + str(j) +
                                "']/div/div/div/div[2]/h5/text()").extract()[0]
                            info = {'rank': j + 1, 'page': 1,
                                    'sponsored':
                                        sponsored_text.strip().lower() == 'sponsored'}
                        except IndexError:
                            info = {'rank': j + 1, 'page': 1, 'sponsored': False}
                        print('Keyword %s: %s ranks on page 1, position %s'
                              % (keyword, asin, j + 1))
                        rank = Rank(keyword_rank=keyword_rank, page=info['page'],
                                    number=info['rank'],
                                    sponsored=info['sponsored'])
                        rank.save()
                        return info
                except Exception:
                    if len(keyword_rank.rank.all()) > 2:
                        rank = Rank(page=keyword_rank.rank.first().page,
                                    number=keyword_rank.rank.first().number,
                                    sponsored=keyword_rank.rank.first().sponsored,
                                    keyword_rank=keyword_rank)
                    else:
                        rank = Rank(keyword_rank=keyword_rank, page=0, number=0,
                                    sponsored=False)
                    rank.save()
                    info = {'rank': 0, 'page': 0, 'sponsored': False}
                    return info
    except Exception as e:
        dt = datetime.datetime.now(pytz.utc) + datetime.timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
