Esempio n. 1
0
 def get_reviews_url_asin(self, goods_html):
     """Pull the ASIN out of the product page's "customer reviews" link.

     The href of the "customer reviews" anchor is located with an XPath
     query via ``ReviewsParser.get_new_data``; the 10-character
     alphanumeric id between ``product-reviews/`` and ``/ref`` is then
     extracted from the first href found.

     Args:
         goods_html: HTML of the product page.

     Returns:
         The first extracted id, or ``''`` when no link or id is found.
     """
     link_xpaths = [
         '//*[text()="Customer Reviews"]/../..//a[contains(text(), "customer reviews")]/@href',
     ]
     review_links = ReviewsParser.get_new_data(xpath_list=link_xpaths,
                                               html_code=goods_html)
     asin_matches = []
     if len(review_links) > 0:
         asin_patterns = [
             re.compile('product-reviews/([A-Za-z0-9]{10,10})/ref'),
         ]
         # Only the first matching link is searched for the id.
         asin_matches = ReviewsParser.get_new_data(
             pattern_list=asin_patterns, html_code=review_links[0])
     found_asin = asin_matches[0] if len(asin_matches) > 0 else ''
     print('get_reviews_url.result: ', found_asin)
     return found_asin
Esempio n. 2
0
 def parser(self,
            html,
            html_type='',
            asin='',
            ip='',
            url='',
            ua='',
            info_log=None,
            debug_log=None,
            monitor_type=0,
            cookie=None,
            tosellSum=None,
            goods_html_code=None):
     '''Run the parser selected by ``html_type`` over ``html``.

     Only goods, reviews and tosell are implemented here; for reviews the
     html is expected to be an html_list. Other modules override this
     method as needed.

     Returns:
         ``(result, is_error)``. ``result`` is ``(goods_datas, bsr_data)``
         for 'goods', a 1-tuple for 'reviews' / 'tosell', and ``()`` when
         ``html_type`` is none of these or the parser raised. ``is_error``
         is True only when the selected parser raised; the exception is
         logged through ``self.debug_log`` and not re-raised.
     '''

     def log_failure(template, error):
         # Every failure message carries the same (ip, url, error) triple.
         self.debug_log.error(template % (ip, url, error))

     parsed = ()
     failed = False
     if html_type == 'goods':
         try:
             goods_datas, bsr_data = GoodsParser().parser_goods(
                 html,
                 asin,
                 monitor_type,
                 ip=ip,
                 ua=ua,
                 debug_log=debug_log,
                 download_url=url,
                 cookies=cookie)
         except Exception as error:
             failed = True
             log_failure('[%s] goods parser解析 [%s] 时 [%s]', error)
         else:
             parsed = (goods_datas, bsr_data)
     elif html_type == 'reviews':
         try:
             reviews_datas = ReviewsParser().reviews_parser(
                 html, asin, ip=ip, download_url=url)
         except Exception as error:
             failed = True
             log_failure('[%s] reviews parser解析 [%s] 时 [%s]', error)
         else:
             parsed = (reviews_datas, )
     elif html_type == 'tosell':
         try:
             tosell_info = TosellParser().tosell_parser(
                 html,
                 asin,
                 tosellSum,
                 ip=ip,
                 download_url=url,
                 goods_html_code=goods_html_code)
         except Exception as error:
             failed = True
             log_failure('[%s] tosell parser解析 [%s] 时 [%s]', error)
         else:
             parsed = (tosell_info, )
     return parsed, failed
Esempio n. 3
0
 def looking_something(self, html_code):
     """Tell whether ``html_code`` contains a "page not found" marker phrase.

     The two marker phrases are compiled with ``re.S`` and handed to
     ``ReviewsParser.get_new_data``.

     Returns:
         True when at least one match is reported, otherwise False.
     """
     marker_phrases = (
         'The Web address you entered is not a functioning',
         'Looking for something',
     )
     marker_patterns = [
         re.compile(phrase, re.S) for phrase in marker_phrases
     ]
     hits = ReviewsParser.get_new_data(pattern_list=marker_patterns,
                                       html_code=html_code)
     return len(hits) > 0
Esempio n. 4
0
 def make_url(asin, url_type='goods', urltitle='', sessionId=''):
     """Build the download URL tuple for ``asin`` according to ``url_type``.

     Returns:
         ``(url, referer)`` for 'goods', ``(url,)`` for 'reviews' and
         'tosell', and an empty tuple for any other ``url_type``.
     """
     if url_type == 'goods':
         goods_url, goods_referer = GoodsParser.make_goods_url(
             asin, urltitle=urltitle, sessionId=sessionId)
         return (goods_url, goods_referer)
     if url_type == 'reviews':
         return (ReviewsParser.make_reviews_url(asin, urltitle=urltitle), )
     if url_type == 'tosell':
         return (TosellParser.make_tosell_url(asin), )
     # Unknown type: nothing to build.
     return tuple()
Esempio n. 5
0
 def is_not_turn_the_page(self, first, html, page_num=0, asin=''):
     '''Decide whether paging through the reviews can stop at this page.

     Rule: when this is not the first download, paging stops once the page
     contains a review dated before yesterday. On the first download,
     paging stops once the page contains a review older than three months
     (90 days).

     Args:
         first: truthy when this is the first download for the asin.
         html: HTML of the reviews page to inspect.
         page_num: page number, used only in the printed messages.
         asin: product id, used only in the printed messages.

     Returns:
         True when no further paging is needed, False when paging should
         continue.
     '''
     # Dates are compared as YYYYMMDD integers; presumably
     # get_reviews_date_list returns ints in that form -- TODO confirm.
     reviews_date_list = ReviewsParser.get_reviews_date_list(html)
     # Named `now` so the local does not shadow the `datetime` module name.
     now = return_PST()
     if len(reviews_date_list) > 0:
         min_reviews_date = min(reviews_date_list)
     else:
         # No dates on the page: fall back to today, which never triggers
         # either cutoff below.
         min_reviews_date = int(now.strftime('%Y%m%d'))
     theYesterDete = int((now - timedelta(days=1)).strftime('%Y%m%d'))
     three_mon_date = int((now - timedelta(days=90)).strftime('%Y%m%d'))
     print(
         '\n%s: min_reviews_date: %s\ntheYesterDete: %s\nthree_mon_date: %s\n'
         % (asin, min_reviews_date, theYesterDete, three_mon_date))
     # If redis crashed and the data was lost, the "first download" flag
     # has to be repaired by querying the database.
     if first:
         cutoff = three_mon_date
         download_label = '是第一次下载'
     else:
         cutoff = theYesterDete
         download_label = '非第一次下载'
     # The label now always matches the branch taken; the original printed
     # the opposite label in three of its four messages.
     if min_reviews_date < cutoff:
         print('%s < %s' % (min_reviews_date, cutoff))
         print('%s, 当前是%s的第%s页评论, 不再需要继续翻页' %
               (download_label, asin, page_num))
         return True
     print('%s, 当前是%s的第%s页评论, 还需要继续翻页' %
           (download_label, asin, page_num))
     return False
Esempio n. 6
0
 def make_page_url(self, asin, page):
     """Return the reviews URL for page ``page`` of ``asin``.

     Delegates directly to ``ReviewsParser.make_reviews_page_url``.
     """
     return ReviewsParser.make_reviews_page_url(asin, page)
Esempio n. 7
0
    def get_reviews_html_list(self, asin, url, referer, **kwargs):
        """Download the reviews pages for ``asin`` and collect their HTML.

        Flow:
          1. Fetch ``url``. If the response is empty or looks like a
             "not found" page, fetch the goods page, read the ASIN from its
             reviews link and retry with the reviews URL built from it.
          2. If the result is still a not-found page, record the asin via
             ``DataOutput.record_not_found_reviews`` and return with 404
             appended to the error list.
          3. When more than 10 reviews are reported, walk the follow-up
             page URLs until ``is_not_turn_the_page`` says to stop.

        Args:
            asin: product id whose reviews are wanted.
            url: URL of the first reviews page.
            referer: referer used for the first request.
            **kwargs: accepted for call compatibility; only printed.

        Returns:
            ``(self.html_list, self.url_list, self.cookies,
            self.is_error_list)``.
        """
        print('\n' * 4, '*' * 20)
        # kwargs is printed as a dict: unpacking it into print() raises
        # TypeError for any key other than sep/end/file/flush.
        print(self.get_reviews_html_list.__name__, asin, url, referer,
              kwargs)
        print('*' * 20, '\n' * 4)
        self.html_list = []
        self.url_list = []
        self.is_error_list = []
        self.cookies = None
        result = (self.html_list, self.url_list, self.cookies,
                  self.is_error_list)
        html, _, is_error = self.get_html(url, referer=referer)
        reviewsUrl = ''
        url_asin = asin
        # Test for an empty response first so looking_something() is never
        # handed an empty/None document.
        if not html or self.looking_something(html):
            goodsUrl_tuple = self.make_url(asin, url_type='goods')
            goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
            print('get_reviews_html_list.goodsUrl: ', goodsUrl)
            goods_html, _, is_error = self.get_html(
                goodsUrl,
                referer=goodsReferer,
                url_type=self.url_type,
                asin=asin)
            url_asin = self.get_reviews_url_asin(goods_html)
            reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
            if not reviewsUrl:
                DataOutput.record_not_found_reviews(asin)
                self.is_error_list.append(404)
                return result

        if reviewsUrl:
            print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
            html, _, is_error = self.get_html(reviewsUrl,
                                              url_type=self.url_type,
                                              asin=asin)

        if ReviewsParser.is_page_not_found(html):
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return result

        self.url_list.append(url)
        self.html_list.append(html)
        self.is_error_list.append(is_error)
        md5key = self.get_md5_key(asin + 'reviewsFirst')
        first = self.dataQ.is_first_download(md5key)
        # Get the total number of reviews first.
        reviewsSum = ReviewsParser.get_review_count(html)
        print('reviewsSum: ', reviewsSum, asin)
        if reviewsSum > 10:
            # If no paging is needed, return right away and save the
            # unnecessary network requests.
            if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
                return result
            is_frist = 1 if first else 0

            # Build the follow-up page URLs.
            page_url_list = self.get_page_urls(url_asin,
                                               reviewsSum,
                                               frist=is_frist)

            if page_url_list:
                referer = url
                # The first page is already downloaded, so numbering
                # starts at 2.
                for page_num, page_url in enumerate(page_url_list, start=2):
                    print('reviews page%s: [%s]' % (page_num, page_url))
                    print('referer %s: [%s]' % (page_num, referer))
                    already_checked = len(self.html_list)
                    self.get_page_html(page_url, referer)
                    referer = page_url
                    # Only inspect pages added by this download; earlier
                    # pages were already checked and said "keep paging".
                    # Presumably get_page_html appends to self.html_list
                    # -- TODO confirm.
                    for page_html in self.html_list[already_checked:]:
                        if self.is_not_turn_the_page(first,
                                                     page_html,
                                                     page_num=page_num,
                                                     asin=asin):
                            return result
        return result
Esempio n. 8
0
    def get_reviews_html_list(self, asin, url, ua, ip, cookie, referer):
        """Download the reviews pages for ``asin`` and collect their HTML.

        Flow:
          1. Fetch ``url``. If the response is empty or looks like a
             "not found" page, fetch the goods page, read the ASIN from its
             reviews link and retry with the reviews URL built from it,
             using the goods URL as referer.
          2. If the result is still a not-found page, record the asin via
             ``DataOutput.record_not_found_reviews`` and return with 404
             appended to the error list.
          3. When more than 10 reviews are reported, walk the follow-up
             page URLs until ``is_not_turn_the_page`` says to stop.

        Args:
            asin: product id whose reviews are wanted.
            url: URL of the first reviews page.
            ua, ip, cookie, referer: forwarded positionally to
                ``self.get_html`` for the first request.

        Returns:
            A 4-tuple of html list, url list, cookie and error list. The
            two "stop paging" exits return the local ``cookie``; every
            other exit returns ``self.cookies``.
        """
        self.html_list = []
        self.url_list = []
        self.is_error_list = []
        self.cookies = None
        print(url, ua)
        html, cookiesObj, is_error = self.get_html(url, ua, ip, cookie,
                                                   referer)
        reviewsUrl = ''
        url_asin = asin
        # Test for an empty response first so looking_something() is never
        # handed an empty/None document.
        if not html or self.looking_something(html):
            goodsUrl_tuple = self.make_url(asin, url_type='goods')
            goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
            print('get_reviews_html_list.goodsUrl: ', goodsUrl)
            goods_html, cookiesObj, is_error = self.get_html(
                goodsUrl,
                ua,
                ip,
                cookie,
                goodsReferer,
                url_type=self.url_type,
                asin=asin)
            url_asin = self.get_reviews_url_asin(goods_html)
            reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
            if not reviewsUrl:
                DataOutput.record_not_found_reviews(asin)
                self.is_error_list.append(404)
                return self.html_list, self.url_list, self.cookies, self.is_error_list

        if reviewsUrl:
            print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
            # The goods page of the resolved ASIN is used as referer.
            goodsUrl = self.make_url(url_asin, url_type='goods')[0]
            html, cookiesObj, is_error = self.get_html(reviewsUrl,
                                                       ua,
                                                       ip,
                                                       cookie,
                                                       goodsUrl,
                                                       url_type=self.url_type,
                                                       asin=asin)

        if ReviewsParser.is_page_not_found(html):
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list

        # Adopt the cookie from the response when the caller supplied none.
        if cookiesObj and not cookie:
            cookie = cookiesObj
        self.url_list.append(url)
        self.html_list.append(html)
        self.is_error_list.append(is_error)
        md5key = self.get_md5_key(asin + 'reviewsFirst')
        first = dataQ.is_first_download(md5key)
        # Get the total number of reviews first.
        reviewsSum = ReviewsParser.get_review_count(html)
        print('reviewsSum: ', reviewsSum, asin)
        if reviewsSum > 10:
            # If no paging is needed, return right away and save the
            # unnecessary network requests.
            if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
                return self.html_list, self.url_list, cookie, self.is_error_list
            is_frist = 1 if first else 0

            # Build the follow-up page URLs.
            page_url_list = self.get_page_urls(url_asin,
                                               reviewsSum,
                                               frist=is_frist)

            if page_url_list:
                referer = url
                # The first page is already downloaded, so numbering
                # starts at 2.
                for page_num, page_url in enumerate(page_url_list, start=2):
                    print('reviews page%s: [%s]' % (page_num, page_url))
                    print('referer %s: [%s]' % (page_num, referer))
                    already_checked = len(self.html_list)
                    # Follow-up pages are requested without a cookie.
                    self.get_page_html(page_url, ua, ip, None, referer)
                    referer = page_url
                    # Only inspect pages added by this download; earlier
                    # pages were already checked and said "keep paging".
                    # Presumably get_page_html appends to self.html_list
                    # -- TODO confirm.
                    for page_html in self.html_list[already_checked:]:
                        if self.is_not_turn_the_page(first,
                                                     page_html,
                                                     page_num=page_num,
                                                     asin=asin):
                            return self.html_list, self.url_list, cookie, self.is_error_list

        # NOTE(review): this exit returns self.cookies while the
        # "stop paging" exits above return the local `cookie` -- confirm
        # which one callers expect.
        return self.html_list, self.url_list, self.cookies, self.is_error_list