def get_reviews_url_asin(self, goods_html):
    '''Extract the ASIN used by the "customer reviews" link of a goods page.

    The href of the link is located relative to the element whose text is
    "Customer Reviews"; the 10-character alphanumeric id that follows
    ``product-reviews/`` is then captured from the first href found.

    Args:
        goods_html: HTML source of the goods (product) page.

    Returns:
        The first captured id, or '' when no link or no id is found.
        (Assumes ReviewsParser.get_new_data returns a sized, indexable
        collection of matches -- verify against its implementation.)
    '''
    href_xpaths = [
        '//*[text()="Customer Reviews"]/../..//a[contains(text(), "customer reviews")]/@href',
    ]
    hrefs = ReviewsParser.get_new_data(xpath_list=href_xpaths, html_code=goods_html)

    asin_matches = []
    if len(hrefs) > 0:
        asin_patterns = [
            re.compile('product-reviews/([A-Za-z0-9]{10,10})/ref'),
        ]
        # Only the first matching href is inspected.
        asin_matches = ReviewsParser.get_new_data(
            pattern_list=asin_patterns, html_code=hrefs[0])

    found_asin = asin_matches[0] if len(asin_matches) > 0 else ''
    print('get_reviews_url.result: ', found_asin)
    return found_asin
def parser(self, html, html_type='', asin='', ip='', url='', ua='',
           info_log=None, debug_log=None, monitor_type=0, cookie=None,
           tosellSum=None, goods_html_code=None):
    '''Dispatch downloaded HTML to the parser matching ``html_type``.

    Only 'goods', 'reviews' and 'tosell' are handled here; for reviews the
    html is expected to be an html_list. Other modules override this method
    as needed.

    Args:
        html: Downloaded page source (an html_list for reviews).
        html_type: 'goods', 'reviews' or 'tosell'; any other value parses
            nothing.
        asin: ASIN the page belongs to.
        ip, url, ua: Request details forwarded to the parsers and used in the
            error log line.
        info_log: Accepted but not used by this implementation.
        debug_log: Forwarded to the goods parser only; errors are logged via
            ``self.debug_log``.
        monitor_type: Forwarded to the goods parser.
        cookie: Forwarded to the goods parser as ``cookies``.
        tosellSum, goods_html_code: Forwarded to the tosell parser.

    Returns:
        ``(result, is_error)``. ``result`` is ``(goods_datas, bsr_data)`` for
        goods, a 1-tuple for reviews/tosell, and ``()`` when nothing was
        parsed or parsing raised. ``is_error`` is True only when the selected
        parser raised.
    '''
    parsed = ()
    failed = False

    if html_type == 'goods':
        try:
            goods_datas, bsr_data = GoodsParser().parser_goods(
                html, asin, monitor_type,
                ip=ip, ua=ua, debug_log=debug_log,
                download_url=url, cookies=cookie)
        except Exception as exc:
            failed = True
            self.debug_log.error('[%s] goods parser解析 [%s] 时 [%s]' % (ip, url, exc))
        else:
            parsed = (goods_datas, bsr_data)
    elif html_type == 'reviews':
        try:
            parsed = (
                ReviewsParser().reviews_parser(html, asin, ip=ip, download_url=url),
            )
        except Exception as exc:
            failed = True
            self.debug_log.error('[%s] reviews parser解析 [%s] 时 [%s]' % (ip, url, exc))
    elif html_type == 'tosell':
        try:
            parsed = (
                TosellParser().tosell_parser(
                    html, asin, tosellSum,
                    ip=ip, download_url=url, goods_html_code=goods_html_code),
            )
        except Exception as exc:
            failed = True
            self.debug_log.error('[%s] tosell parser解析 [%s] 时 [%s]' % (ip, url, exc))

    return parsed, failed
def looking_something(self, html_code):
    '''Tell whether ``html_code`` contains one of the "not found" phrases.

    Args:
        html_code: Page source to inspect.

    Returns:
        True when ReviewsParser.get_new_data reports at least one match for
        the phrases below, otherwise False.
    '''
    phrases = (
        'The Web address you entered is not a functioning',
        'Looking for something',
    )
    not_found_patterns = [re.compile(phrase, re.S) for phrase in phrases]
    matches = ReviewsParser.get_new_data(
        pattern_list=not_found_patterns, html_code=html_code)
    return len(matches) > 0
def make_url(asin, url_type='goods', urltitle='', sessionId=''):
    '''Build the download URL tuple for an ASIN.

    Args:
        asin: ASIN to build the URL for.
        url_type: 'goods', 'reviews' or 'tosell'.
        urltitle: Forwarded to the goods and reviews URL builders.
        sessionId: Forwarded to the goods URL builder only.

    Returns:
        ``(url, referer)`` for goods, ``(url,)`` for reviews and tosell, and
        an empty tuple for any other ``url_type``.
    '''
    if url_type == 'goods':
        goods_url, goods_referer = GoodsParser.make_goods_url(
            asin, urltitle=urltitle, sessionId=sessionId)
        return (goods_url, goods_referer)
    if url_type == 'reviews':
        return (ReviewsParser.make_reviews_url(asin, urltitle=urltitle), )
    if url_type == 'tosell':
        return (TosellParser.make_tosell_url(asin), )
    return tuple()
def is_not_turn_the_page(self, first, html, page_num=0, asin=''):
    '''Decide whether paging through an ASIN's reviews can stop at this page.

    Rule:
      * repeat download (``first`` falsy): stop once the page holds a review
        dated before yesterday;
      * first download (``first`` truthy): stop once the page holds a review
        dated more than 90 days ago.

    Dates are compared as ``YYYYMMDD`` integers (assumes
    ReviewsParser.get_reviews_date_list returns values in that form -- verify
    against its implementation). When the page yields no dates, today's date
    is used, so paging continues.

    Args:
        first: Truthy when this ASIN's reviews are downloaded for the first
            time.
        html: Source of the review page to inspect.
        page_num: Page number, used only in the progress output.
        asin: ASIN, used only in the progress output.

    Returns:
        True when no further pages need to be fetched, False otherwise.
    '''
    reviews_date_list = ReviewsParser.get_reviews_date_list(html)
    # Named ``now`` so the local no longer shadows the ``datetime`` module name.
    now = return_PST()
    if len(reviews_date_list) > 0:
        min_reviews_date = min(reviews_date_list)
    else:
        min_reviews_date = int(now.strftime('%Y%m%d'))

    theYesterDete = int((now - timedelta(days=1)).strftime('%Y%m%d'))
    three_mon_date = int((now - timedelta(days=90)).strftime('%Y%m%d'))
    print(
        '\n%s: min_reviews_date: %s\ntheYesterDete: %s\nthree_mon_date: %s\n' %
        (asin, min_reviews_date, theYesterDete, three_mon_date))

    # If redis crashed and lost its data, the "first download" flag has to be
    # restored from the database for this decision to be right.
    if first:
        mode, cutoff = '是第一次下载', three_mon_date
    else:
        mode, cutoff = '非第一次下载', theYesterDete

    # The message prefix now always names the mode that was actually applied;
    # previously three of the four messages named the opposite mode.
    if min_reviews_date < cutoff:
        print('%s < %s' % (min_reviews_date, cutoff))
        print('%s, 当前是%s的第%s页评论, 不再需要继续翻页' % (mode, asin, page_num))
        return True
    print('%s, 当前是%s的第%s页评论, 还需要继续翻页' % (mode, asin, page_num))
    return False
def make_page_url(self, asin, page):
    '''Return the URL of reviews page ``page`` for ``asin``.

    Thin delegate to ReviewsParser.make_reviews_page_url.
    '''
    return ReviewsParser.make_reviews_page_url(asin, page)
def get_reviews_html_list(self, asin, url, referer, **kwargs):
    '''Download the review pages of an ASIN and collect their HTML.

    Steps:
      1. Fetch ``url``. If nothing came back or the page matches
         ``looking_something``, fetch the goods page, read the ASIN used by
         its reviews link, rebuild the reviews URL and fetch that instead.
      2. If ReviewsParser.is_page_not_found flags the page, record the ASIN
         as having no reviews and return with 404 in the error list.
      3. Store the first page. When more than 10 reviews are reported, walk
         the URLs from ``get_page_urls`` until ``is_not_turn_the_page`` says
         paging can stop.

    Args:
        asin: ASIN whose reviews are wanted.
        url: URL of the first reviews page.
        referer: Referer sent with the first request.
        **kwargs: Not used by the logic; only echoed in the debug output.

    Returns:
        ``(self.html_list, self.url_list, self.cookies, self.is_error_list)``.
        ``self.cookies`` is reset to None here and not reassigned by this
        method itself.
    '''
    print('\n' * 4, '*' * 20)
    # kwargs is printed as a dict. Unpacking it into print() raised TypeError
    # for every key that is not one of print's own keyword arguments.
    print(self.get_reviews_html_list.__name__, asin, url, referer, kwargs)
    print('*' * 20, '\n' * 4)

    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None

    html, cookiesObj, is_error = self.get_html(url, referer=referer)
    url_asin = asin
    # Test for an empty response first so the pattern search is never run on
    # a missing page.
    if not html or self.looking_something(html):
        goodsUrl_tuple = self.make_url(asin, url_type='goods')
        goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
        print('get_reviews_html_list.goodsUrl: ', goodsUrl)
        goods_html, cookiesObj, is_error = self.get_html(
            goodsUrl, referer=goodsReferer, url_type=self.url_type, asin=asin)
        url_asin = self.get_reviews_url_asin(goods_html)
        reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
        if not reviewsUrl:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list
        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
        html, cookiesObj, is_error = self.get_html(
            reviewsUrl, url_type=self.url_type, asin=asin)

    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list

    # The originally requested url is recorded even when the page was
    # re-fetched from the rebuilt reviews URL.
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)

    md5key = self.get_md5_key(asin + 'reviewsFirst')
    first = self.dataQ.is_first_download(md5key)

    # Get the total number of reviews first.
    reviewsSum = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', reviewsSum, asin)
    if reviewsSum > 10:
        # Return right away when no paging is needed, to avoid useless
        # network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        is_frist = 1 if first else 0
        page_url_list = self.get_page_urls(url_asin, reviewsSum, frist=is_frist)
        if page_url_list:
            page_referer = url
            # Page 1 is already stored, so the follow-up pages start at 2.
            for page_num, page_url in enumerate(page_url_list, start=2):
                print('reviews page%s: [%s]' % (page_num, page_url))
                print('referer %s: [%s]' % (page_num, page_referer))
                # Presumably stores the downloaded page on self.html_list;
                # the helper is not visible here.
                self.get_page_html(page_url, page_referer)
                page_referer = page_url
                # Every page collected so far is re-examined after each
                # download; stop as soon as one of them allows it.
                if any(self.is_not_turn_the_page(first, page_html,
                                                 page_num=page_num, asin=asin)
                       for page_html in self.html_list):
                    return self.html_list, self.url_list, self.cookies, self.is_error_list

    return self.html_list, self.url_list, self.cookies, self.is_error_list
def get_reviews_html_list(self, asin, url, ua, ip, cookie, referer):
    '''Download the review pages of an ASIN and collect their HTML.

    The first reviews page is fetched from ``url``. When that response is
    empty or matches ``looking_something``, the goods page is fetched, the
    ASIN used by its reviews link is extracted, and the reviews page is
    fetched again from the rebuilt URL with the goods URL as referer.
    Follow-up pages are downloaded only when more than 10 reviews are
    reported, and only until ``is_not_turn_the_page`` says paging can stop.

    Args:
        asin: ASIN whose reviews are wanted.
        url: URL of the first reviews page.
        ua, ip, cookie: Passed positionally to ``self.get_html``.
        referer: Referer for the first request.

    Returns:
        A 4-tuple ``(html_list, url_list, cookies, is_error_list)``.
        NOTE(review): the early "stop paging" returns pass the local
        ``cookie`` (possibly replaced by ``cookiesObj``), while the 404
        returns and the final return pass ``self.cookies``, which this
        method sets to None and never reassigns -- confirm which one callers
        expect.
    '''
    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None
    print(url, ua)
    # url = 'https://www.amazon.com/product-reviews/B000TZ8TEU/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&sortBy=recent'
    html, cookiesObj, is_error = self.get_html(url, ua, ip, cookie, referer)
    reviewsUrl = ''
    url_asin = asin
    # Fall back to the goods page when the reviews page is missing or empty.
    if self.looking_something(html) or not html:
        goodsUrl_tuple = self.make_url(asin, url_type='goods')
        goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
        print('get_reviews_html_list.goodsUrl: ', goodsUrl)
        goods_html, cookiesObj, is_error = self.get_html(
            goodsUrl, ua, ip, cookie, goodsReferer, url_type=self.url_type, asin=asin)
        # The goods page may link to its reviews under a different ASIN.
        url_asin = self.get_reviews_url_asin(goods_html)
        reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
        # print(reviewsUrl, goodsUrl)
        if not reviewsUrl:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list
    if reviewsUrl:
        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
        goodsUrl_tuple = self.make_url(url_asin, url_type='goods')
        goodsUrl = goodsUrl_tuple[0]
        # Retry with the rebuilt reviews URL, using the goods URL as referer.
        html, cookiesObj, is_error = self.get_html(reviewsUrl, ua, ip, cookie, goodsUrl, url_type=self.url_type, asin=asin)
    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list
    # Adopt the cookies from the response when the caller supplied none.
    if cookiesObj and not cookie:
        cookie = cookiesObj
    # The originally requested url is recorded even when the page was
    # re-fetched from the rebuilt reviews URL.
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)
    md5value = asin + 'reviewsFirst'
    md5key = self.get_md5_key(md5value)
    # NOTE(review): ``dataQ`` is a bare name here, not ``self.dataQ`` -- it
    # must be defined outside this block; confirm it exists at module level.
    first = dataQ.is_first_download(md5key)
    # Get the total number of reviews first.
    reviewsSum = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', reviewsSum, asin)
    if reviewsSum > 10:
        # Return right away when no paging is needed, to avoid useless
        # network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, cookie, self.is_error_list
        # Whether this is the first download of the reviews.  review: ['174']
        if first:
            is_frist = 1
        else:
            is_frist = 0
        # Get the URLs of the follow-up pages.
        page_url_list = self.get_page_urls(url_asin, reviewsSum, frist=is_frist)
        tList = []
        tStart = 0
        # Download the review pages one by one (original note: multi-threaded).
        if page_url_list:
            # i and j always hold the same value: the number of the page
            # being fetched (page 1 is already stored).
            i = 1
            j = 1
            referer = url
            for page_url in page_url_list:
                i += 1
                j += 1
                print('reviews page%s: [%s]' % (i, page_url))
                print('referer %s: [%s]' % (i, referer))
                # Follow-up pages are requested without a cookie (None).
                # Presumably get_page_html stores the page on self.html_list;
                # the helper is not visible here.
                self.get_page_html(page_url, ua, ip, None, referer)
                referer = page_url
                # Every page collected so far is re-examined after each
                # download; stop as soon as one of them allows it.
                for html in self.html_list:
                    if self.is_not_turn_the_page(first, html, page_num=j, asin=asin):
                        return self.html_list, self.url_list, cookie, self.is_error_list
    return self.html_list, self.url_list, self.cookies, self.is_error_list