Beispiel #1
0
    def get_reviews_html_list(self, asin, url, referer, **kwargs):
        """Fetch the reviews pages for ``asin`` and collect their HTML.

        The first page is requested from ``url``.  When that response is
        empty, or ``self.looking_something`` flags it, the goods page is
        fetched instead, the ASIN to use for reviews is read from it with
        ``self.get_reviews_url_asin`` and the reviews page is requested again
        from a URL rebuilt with ``self.make_url``.  When the parsed review
        count is greater than 10, the URLs returned by ``self.get_page_urls``
        are downloaded one after another through ``self.get_page_html`` until
        ``self.is_not_turn_the_page`` reports that paging can stop.

        Args:
            asin: Identifier used to build URLs and the first-download key.
            url: URL of the first reviews page; also recorded in
                ``self.url_list`` and used as referer for the second page.
            referer: Referer forwarded to the first ``self.get_html`` call.
            **kwargs: Accepted for call compatibility; only logged.

        Returns:
            The tuple ``(self.html_list, self.url_list, self.cookies,
            self.is_error_list)``.  When no reviews page can be found, 404 is
            appended to ``self.is_error_list`` and the HTML/URL lists are
            returned as they are at that point (empty).
        """
        print('\n' * 4, '*' * 20)
        # kwargs is printed as a value: unpacking it into print() raised
        # TypeError for every key that is not one of print's own options
        # (sep, end, file, flush).
        print(self.get_reviews_html_list.__name__, asin, url, referer, kwargs)
        print('*' * 20, '\n' * 4)

        self.html_list = []
        self.url_list = []
        self.is_error_list = []
        self.cookies = None

        def _result():
            # Built at return time so it reflects the attributes as they are
            # when the method exits.
            return (self.html_list, self.url_list, self.cookies,
                    self.is_error_list)

        html, _cookies, is_error = self.get_html(url, referer=referer)
        reviews_url = ''
        url_asin = asin
        if self.looking_something(html) or not html:
            # The direct request did not yield a usable page: go through the
            # goods page to find out which ASIN the reviews live under.
            goods_info = self.make_url(asin, url_type='goods')
            goods_url, goods_referer = goods_info[0], goods_info[1]
            print('get_reviews_html_list.goodsUrl: ', goods_url)
            goods_html, _cookies, is_error = self.get_html(
                goods_url,
                referer=goods_referer,
                url_type=self.url_type,
                asin=asin)
            url_asin = self.get_reviews_url_asin(goods_html)
            reviews_url = self.make_url(url_asin, url_type=self.url_type)[0]
            if not reviews_url:
                DataOutput.record_not_found_reviews(asin)
                self.is_error_list.append(404)
                return _result()

        if reviews_url:
            print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviews_url)
            # NOTE(review): the goods URL is still built here but is not
            # passed to get_html; the sibling variant of this method passes
            # it as the referer.  Kept as-is pending confirmation of the
            # intended behavior.
            goods_info = self.make_url(url_asin, url_type='goods')
            goods_url = goods_info[0]
            html, _cookies, is_error = self.get_html(reviews_url,
                                                     url_type=self.url_type,
                                                     asin=asin)

        if ReviewsParser.is_page_not_found(html):
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return _result()

        self.url_list.append(url)
        self.html_list.append(html)
        self.is_error_list.append(is_error)

        md5key = self.get_md5_key(asin + 'reviewsFirst')
        first = self.dataQ.is_first_download(md5key)

        # Get the total number of reviews first.
        reviews_sum = ReviewsParser.get_review_count(html)
        print('reviewsSum: ', reviews_sum, asin)
        if reviews_sum > 10:
            # If no paging is needed, return right away to avoid unnecessary
            # network requests.
            if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
                return _result()

            # Tell get_page_urls whether this is the first reviews download.
            page_url_list = self.get_page_urls(url_asin,
                                               reviews_sum,
                                               frist=1 if first else 0)

            # Download the remaining pages one by one.
            if page_url_list:
                page_referer = url
                # The first page was fetched above, so numbering starts at 2.
                for page_num, page_url in enumerate(page_url_list, start=2):
                    print('reviews page%s: [%s]' % (page_num, page_url))
                    print('referer %s: [%s]' % (page_num, page_referer))
                    self.get_page_html(page_url, page_referer)
                    page_referer = page_url
                    # Every page collected so far is re-checked with the
                    # current page number; stop as soon as one says so.
                    for page_html in self.html_list:
                        if self.is_not_turn_the_page(first,
                                                     page_html,
                                                     page_num=page_num,
                                                     asin=asin):
                            return _result()
        return _result()
    def get_reviews_html_list(self, asin, url, ua, ip, cookie, referer):
        """Collect the HTML of the reviews pages for ``asin``.

        ``url`` is requested first.  If the response is empty or flagged by
        ``self.looking_something``, the goods page is fetched, the reviews
        ASIN is read from it and the reviews page is requested again with the
        goods URL as referer.  When more than 10 reviews are reported, the
        URLs from ``self.get_page_urls`` are downloaded in order until
        ``self.is_not_turn_the_page`` returns a truthy value.

        Args:
            asin: Identifier used to build URLs and the first-download key.
            url: URL of the first reviews page.
            ua: Forwarded to ``self.get_html`` / ``self.get_page_html``
                (presumably the user-agent string -- confirm).
            ip: Forwarded likewise (presumably a proxy address -- confirm).
            cookie: Forwarded to ``self.get_html``; replaced by the cookies
                object of the last response when it is falsy.
            referer: Referer for the first request.

        Returns:
            A 4-tuple ``(html_list, url_list, cookies, is_error_list)``.
            The third item is the local ``cookie`` when paging stops early
            and ``self.cookies`` on every other path.
        """
        self.html_list = []
        self.url_list = []
        self.is_error_list = []
        self.cookies = None
        print(url, ua)

        html, cookies_obj, is_error = self.get_html(url, ua, ip, cookie,
                                                    referer)
        reviews_url = ''
        url_asin = asin
        if self.looking_something(html) or not html:
            # Fall back to the goods page to discover the reviews ASIN.
            goods_info = self.make_url(asin, url_type='goods')
            goods_url = goods_info[0]
            goods_referer = goods_info[1]
            print('get_reviews_html_list.goodsUrl: ', goods_url)
            goods_html, cookies_obj, is_error = self.get_html(
                goods_url, ua, ip, cookie, goods_referer,
                url_type=self.url_type, asin=asin)
            url_asin = self.get_reviews_url_asin(goods_html)
            reviews_url = self.make_url(url_asin, url_type=self.url_type)[0]
            if not reviews_url:
                DataOutput.record_not_found_reviews(asin)
                self.is_error_list.append(404)
                return self.html_list, self.url_list, self.cookies, self.is_error_list

        if reviews_url:
            print('%s get_reviews_html_list.reviewsUrl: ' % asin, reviews_url)
            # Retry the reviews page, this time coming from the goods page.
            goods_url = self.make_url(url_asin, url_type='goods')[0]
            html, cookies_obj, is_error = self.get_html(
                reviews_url, ua, ip, cookie, goods_url,
                url_type=self.url_type, asin=asin)

        if ReviewsParser.is_page_not_found(html):
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        if cookies_obj and not cookie:
            cookie = cookies_obj

        self.url_list.append(url)
        self.html_list.append(html)
        self.is_error_list.append(is_error)

        # dataQ is not a local or an attribute here; it is looked up as a
        # name from the surrounding scope.
        first = dataQ.is_first_download(self.get_md5_key(asin + 'reviewsFirst'))

        # Get the total number of reviews first.
        review_total = ReviewsParser.get_review_count(html)
        print('reviewsSum: ', review_total, asin)
        if not (review_total > 10):
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        # If no paging is needed, return right away to avoid unnecessary
        # network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, cookie, self.is_error_list

        # Tell get_page_urls whether this is the first reviews download.
        page_urls = self.get_page_urls(url_asin,
                                       review_total,
                                       frist=1 if first else 0)
        if not page_urls:
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        # Download the remaining pages one by one; page 1 is already stored.
        previous_url = url
        for page_no, page_url in enumerate(page_urls, start=2):
            print('reviews page%s: [%s]' % (page_no, page_url))
            print('referer %s: [%s]' % (page_no, previous_url))
            self.get_page_html(page_url, ua, ip, None, previous_url)
            previous_url = page_url
            # Re-check every page collected so far with the current number.
            for collected_html in self.html_list:
                if self.is_not_turn_the_page(first,
                                             collected_html,
                                             page_num=page_no,
                                             asin=asin):
                    return self.html_list, self.url_list, cookie, self.is_error_list

        return self.html_list, self.url_list, self.cookies, self.is_error_list