def get_reviews_html_list(self, asin, url, referer, **kwargs):
    """Download the reviews pages for ``asin`` and collect their HTML.

    The first page is fetched from ``url``. If ``self.looking_something``
    flags that page, or the page is empty, the goods page is fetched instead,
    the reviews asin is re-derived from it and the reviews page is requested
    again with the rebuilt URL. When more than 10 reviews are reported, the
    follow-up pages returned by ``self.get_page_urls`` are downloaded one by
    one until ``self.is_not_turn_the_page`` says paging can stop.

    Args:
        asin: Product identifier used for URL building and bookkeeping.
        url: URL of the first reviews page.
        referer: Referer passed along with the first request.
        **kwargs: Accepted for call compatibility; only echoed in the debug
            output and otherwise ignored.

    Returns:
        A 4-tuple ``(self.html_list, self.url_list, self.cookies,
        self.is_error_list)``. When the reviews page cannot be found, ``404``
        is appended to ``self.is_error_list`` before returning.
    """
    # NOTE(review): this method was recovered from a whitespace-collapsed
    # source; the nesting below (404 check applied to whichever page was
    # fetched last, stop-check executed after every downloaded page) is a
    # reconstruction and should be confirmed against the original layout.
    print('\n' * 4, '*' * 20)
    # kwargs is printed as a value. Forwarding it with ** would hand the
    # caller's extra arguments to print() as options and raise TypeError for
    # anything other than sep/end/file/flush.
    print(self.get_reviews_html_list.__name__, asin, url, referer, kwargs)
    print('*' * 20, '\n' * 4)

    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None

    html, _, is_error = self.get_html(url, referer=referer)
    url_asin = asin

    if self.looking_something(html) or not html:
        # Fall back to the goods page to find the asin the reviews live under.
        goods_url_parts = self.make_url(asin, url_type='goods')
        goods_url, goods_referer = goods_url_parts[0], goods_url_parts[1]
        print('get_reviews_html_list.goodsUrl: ', goods_url)
        goods_html, _, _ = self.get_html(
            goods_url, referer=goods_referer, url_type=self.url_type, asin=asin)
        url_asin = self.get_reviews_url_asin(goods_html)
        reviews_url = self.make_url(url_asin, url_type=self.url_type)[0]
        if not reviews_url:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviews_url)
        # NOTE(review): the goods URL built here appears to have been the
        # referer of the request below in an earlier revision; its value is no
        # longer used. The call is kept in case make_url has side effects --
        # confirm and delete.
        self.make_url(url_asin, url_type='goods')
        html, _, is_error = self.get_html(
            reviews_url, url_type=self.url_type, asin=asin)

    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list

    # The originally requested url is what gets recorded, even when the page
    # was re-fetched through the rebuilt reviews URL.
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)

    md5key = self.get_md5_key(asin + 'reviewsFirst')
    first = self.dataQ.is_first_download(md5key)

    # Get the total number of reviews first.
    reviews_sum = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', reviews_sum, asin)

    if reviews_sum > 10:
        # If paging is not needed, return right away to avoid unnecessary
        # network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        # Tell get_page_urls whether this is the first download of the reviews
        # ("frist" is the keyword name that method expects).
        page_url_list = self.get_page_urls(
            url_asin, reviews_sum, frist=1 if first else 0)

        if page_url_list:
            previous_url = url
            # Page 1 is already downloaded, so follow-up pages start at 2.
            for page_num, page_url in enumerate(page_url_list, start=2):
                print('reviews page%s: [%s]' % (page_num, page_url))
                print('referer %s: [%s]' % (page_num, previous_url))
                self.get_page_html(page_url, previous_url)
                # Each page is requested with the previous page as referer.
                previous_url = page_url
                for page_html in self.html_list:
                    if self.is_not_turn_the_page(
                            first, page_html, page_num=page_num, asin=asin):
                        return (self.html_list, self.url_list,
                                self.cookies, self.is_error_list)

    return self.html_list, self.url_list, self.cookies, self.is_error_list
def get_reviews_html_list(self, asin, url, ua, ip, cookie, referer):
    """Download the reviews pages for ``asin`` and collect their HTML.

    The first page is requested from ``url`` with the given ``ua``, ``ip``,
    ``cookie`` and ``referer``. If ``self.looking_something`` flags that page,
    or the page is empty, the goods page is fetched, the reviews asin is
    re-derived from it and the reviews page is requested again with the goods
    URL as referer. When more than 10 reviews are reported, the follow-up
    pages returned by ``self.get_page_urls`` are downloaded one by one until
    ``self.is_not_turn_the_page`` says paging can stop.

    Returns:
        A 4-tuple of ``self.html_list``, ``self.url_list``, a cookie value and
        ``self.is_error_list``. When the reviews page cannot be found, ``404``
        is appended to ``self.is_error_list`` before returning.
    """
    # NOTE(review): this method was recovered from a whitespace-collapsed
    # source; the nesting below (404 check applied to whichever page was
    # fetched last, stop-check executed after every downloaded page) is a
    # reconstruction and should be confirmed against the original layout.
    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None
    print(url, ua)

    html, cookiesObj, is_error = self.get_html(url, ua, ip, cookie, referer)
    url_asin = asin

    if self.looking_something(html) or not html:
        # Fall back to the goods page to find the asin the reviews live under.
        goods_parts = self.make_url(asin, url_type='goods')
        goods_url, goods_referer = goods_parts[0], goods_parts[1]
        print('get_reviews_html_list.goodsUrl: ', goods_url)
        goods_html, _, _ = self.get_html(
            goods_url, ua, ip, cookie, goods_referer,
            url_type=self.url_type, asin=asin)
        url_asin = self.get_reviews_url_asin(goods_html)
        reviews_url = self.make_url(url_asin, url_type=self.url_type)[0]
        if not reviews_url:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviews_url)
        # The goods page of the re-derived asin is the referer of the retry.
        retry_referer = self.make_url(url_asin, url_type='goods')[0]
        html, cookiesObj, is_error = self.get_html(
            reviews_url, ua, ip, cookie, retry_referer,
            url_type=self.url_type, asin=asin)

    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list

    # Adopt the cookies handed back by the request when the caller gave none.
    if cookiesObj and not cookie:
        cookie = cookiesObj

    # The originally requested url is what gets recorded, even when the page
    # was re-fetched through the rebuilt reviews URL.
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)

    # NOTE(review): dataQ is resolved from the enclosing scope here, not from
    # self -- confirm that is the intended object.
    first = dataQ.is_first_download(self.get_md5_key(asin + 'reviewsFirst'))

    # Get the total number of reviews first.
    review_total = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', review_total, asin)

    # Nothing more to download unless more than 10 reviews are reported.
    if not (review_total > 10):
        return self.html_list, self.url_list, self.cookies, self.is_error_list

    # If paging is not needed, return right away to avoid unnecessary
    # network requests.
    if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
        return self.html_list, self.url_list, cookie, self.is_error_list

    # Tell get_page_urls whether this is the first download of the reviews
    # ("frist" is the keyword name that method expects).
    page_url_list = self.get_page_urls(
        url_asin, review_total, frist=1 if first else 0)

    if page_url_list:
        previous_url = url
        # Page 1 is already downloaded, so follow-up pages start at 2. An
        # older comment here said "multi-threaded"; this loop issues the
        # downloads one after another.
        for page_num, page_url in enumerate(page_url_list, start=2):
            print('reviews page%s: [%s]' % (page_num, page_url))
            print('referer %s: [%s]' % (page_num, previous_url))
            # Follow-up pages are requested without a cookie (None).
            self.get_page_html(page_url, ua, ip, None, previous_url)
            # Each page is requested with the previous page as referer.
            previous_url = page_url
            for fetched_html in self.html_list:
                if self.is_not_turn_the_page(
                        first, fetched_html, page_num=page_num, asin=asin):
                    return (self.html_list, self.url_list,
                            cookie, self.is_error_list)

    # NOTE(review): the early-stop returns above hand back the local `cookie`,
    # while this final return hands back `self.cookies` -- confirm that the
    # difference is intended.
    return self.html_list, self.url_list, self.cookies, self.is_error_list