Code example #1
 def _get_byb_merchant(self, html_code=None):
     if html_code is None:
         return '', ''
     parser = GoodsParser(html_code)
     sname = parser._get_seller_name()
     seller_id = parser._get_seller_id()
     print('_get_byb_merchant.sname', sname)
     print('_get_byb_merchant.seller_id', seller_id)
     if not sname:
         seller_id = ''
     return sname, seller_id
Code example #2
 def make_url(asin, cid=0, url_type='goods', urltitle='', sessionId=''):
     url_tuple = tuple()
     if url_type == 'goods':
         url, referer = GoodsParser.make_goods_url(asin,
                                                   urltitle=urltitle,
                                                   sessionId=sessionId)
         url_tuple = (url, referer)
     if url_type == 'reviews':
         url = ReviewsParser.make_reviews_url(asin, urltitle=urltitle)
         url_tuple = (url, )
     if url_type == 'tosell':
         url = TosellParser.make_tosell_url(asin)
         url_tuple = (url, )
     if url_type == 'keyword':
         url = GoodsParser.make_search_url(asin, cid)
         url_tuple = (url, )
     return url_tuple
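A short usage sketch for make_url, assuming it is reachable as BaseCrawler.make_url the way code examples #3 and #5 call it; the ASIN and keyword values below are placeholders, not taken from the project:

# Sketch only: 'B000000000' and 'coffee maker' are placeholder inputs.
url, referer = BaseCrawler.make_url('B000000000', url_type='goods')
reviews_url, = BaseCrawler.make_url('B000000000', url_type='reviews')
search_url, = BaseCrawler.make_url('coffee maker', cid=123, url_type='keyword')
# Any other url_type falls through every branch and returns an empty tuple.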
Code example #3
def get_brand_from_parser(asin):
    # Build the URL from the asin
    url, refer = BaseCrawler.make_url(asin)
    # Call the downloader to fetch the page
    html_code = get_data_from_requests(url)
    # Call the goods parser to extract the brand from the page
    brand = GoodsParser(html_code)._get_brand()
    return brand
Code example #4
File: BaseCrawler.py Project: xusu12/hs_code
 def parser(self,
            html,
            html_type='',
            asin='',
            ip='',
            url='',
            ua='',
            info_log=None,
            debug_log=None,
            monitor_type=0,
            cookie=None,
            tosellSum=None,
            goods_html_code=None):
     '''Only goods, reviews and tosell parsing are implemented here; for reviews, html must be an html_list. Other modules should override this method as needed.'''
     result = ()
     is_error = False
     if html_type == 'goods':
         try:
             goods_datas, bsr_data = GoodsParser().parser_goods(
                 html,
                 asin,
                 monitor_type,
                 ip=ip,
                 ua=ua,
                 debug_log=debug_log,
                 download_url=url,
                 cookies=cookie)
             result = (goods_datas, bsr_data)
         except Exception as e:
             is_error = True
             self.debug_log.error('[%s] goods parser error while parsing [%s]: [%s]' %
                                  (ip, url, e))
     if html_type == 'reviews':
         try:
             reviews_datas = ReviewsParser().reviews_parser(
                 html, asin, ip=ip, download_url=url)
             result = (reviews_datas, )
         except Exception as e:
             is_error = True
             self.debug_log.error('[%s] reviews parser error while parsing [%s]: [%s]' %
                                  (ip, url, e))
     if html_type == 'tosell':
         try:
             tosell_info = TosellParser().tosell_parser(
                 html,
                 asin,
                 tosellSum,
                 ip=ip,
                 download_url=url,
                 goods_html_code=goods_html_code)
             result = (tosell_info, )
         except Exception as e:
             is_error = True
             self.debug_log.error('[%s] tosell parser error while parsing [%s]: [%s]' %
                                  (ip, url, e))
     return result, is_error
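A hedged sketch of driving the dispatch above for a goods page; the crawler instance, the html variable and the ip/ua/url values are assumptions for illustration, not taken from the project:

# Sketch only: 'crawler' and 'html' are assumed to exist; ip/ua/url are dummies.
result, is_error = crawler.parser(html,
                                  html_type='goods',
                                  asin='B000000000',
                                  ip='127.0.0.1',
                                  ua='Mozilla/5.0',
                                  url='https://www.amazon.com/dp/B000000000',
                                  monitor_type=1)
if not is_error and result:
    goods_datas, bsr_data = result  # the 'goods' branch returns a 2-tuple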
Code example #5
File: brandPatch.py Project: xusu12/hs_code
def get_brand_from_parser(asin, debug_log):
    # Build the URL from the asin
    urls = BaseCrawler.make_url(asin)
    url = urls[0]
    refer = urls[1]
    # Get a user agent
    ua = UaPond.get_new_ua()
    cookies = ''
    ip = ''
    ipQ = ''
    # Call the downloader to fetch the page
    html_data, cookie, is_error = get_html_useRequest(url,
                                                      ua,
                                                      ip,
                                                      cookies,
                                                      debug_log,
                                                      refer,
                                                      ipQ,
                                                      url_type='goods',
                                                      asin=asin)
    # Call the goods parser to extract the brand from the page
    brand = GoodsParser(html_data)._get_brand()
    return brand
Code example #6
 def get_to_sell_price(html_code=None):
     return GoodsParser(html_code)._to_price(html_code)
Code example #7
 def get_to_sell_sum(html_code=None):
     return GoodsParser(html_code)._to_sell(html_code)
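A minimal usage sketch for the two wrappers above, mirroring the file-reading pattern of code example #8; 'product.html' is a placeholder file name, not part of the project:

# Sketch only: 'product.html' stands in for a locally saved goods page.
with open('product.html', 'r', encoding='utf8') as f:
    html_code = f.read()
print(get_to_sell_price(html_code=html_code))
print(get_to_sell_sum(html_code=html_code))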
Code example #8
File: rrg_test.py Project: xusu12/hs_code
from Crawler.goodsParser import GoodsParser

goods = GoodsParser()
with open('1.html', 'r', encoding='utf8') as f:
    html = f.read()
goods.parser_goods(html, 'the_asin', '')
count = goods._get_review_count(html_code=html)
code = goods._get_review_rating(html_code=html)
print(count)
print(code)
Code example #9
File: goodsCrawler.py Project: xusu12/hs_code
    def download(self, ip, asin_or_kw, url_dict):
        url_type = self.url_type
        asin = asin_or_kw
        monitor_type = url_dict.get('mtp') or 1
        print('url type: ', url_type)
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(asin + url_type)
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time_now = lambda: time.time()
        time1 = time_now()
        ua = self.get_ua()
        self.debug_log.debug('goods ua: %s' % (ua))
        # value_str = ip + ua
        # self.debug_log.debug('goods value_str: %s' % (value_str))
        # cookMd5key = self.get_md5_key(value_str)
        cookMd5key = ''
        cookie = self.get_cookie(cookMd5key)
        print('\ngoodsCookie: ', cookie)
        # url_title = urlQ.get_urlTitle_from_string(asin)
        url_title = ''
        sessionId = ''
        # if cookie:
        #     sessionId = cookie.get('session-id')
        retry = False
        old_dnum = url_dict.get('dnum') or 0
        if old_dnum > 3:
            retry = True
        url, referer = GoodsParser.make_goods_url(asin,
                                                  urltitle=url_title,
                                                  sessionId=sessionId,
                                                  retry=retry)
        cookies = cookie
        print('goods referer: %s' % (referer))
        print('[ip %s] working... [%s]' % (ip, url))
        if url:
            print('goods_url: ', url)
            html, cookiesObj, is_error = self.get_html(url,
                                                       ua,
                                                       ip,
                                                       cookies,
                                                       referer,
                                                       url_type=url_type,
                                                       asin=asin)
            print('this is the HTML')
            # with open('data/devtest/6_1.html', 'w', encoding='utf8') as f:
            #     f.write(html)
            durl = url_dict.get('durl') or []
            durl.append(url)
            url_dict['durl'] = list(set(durl))
            url_dict['dnum'] = old_dnum + 1
            if is_error:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                msgInt = 6
                proxyInfo = 'get Html error'
                self.record_log(asin, time1, msgInt, url_type, startTime, ip,
                                proxyInfo)

            else:
                analyze = self.analyze_html(html,
                                            cookie,
                                            cookiesObj,
                                            ip,
                                            asin_or_kw,
                                            url_dict,
                                            cookMd5key,
                                            time1,
                                            startTime,
                                            html_type=url_type)
                if analyze and analyze != 404:
                    # Get the url_title and save it
                    # self.save_url_title(asin, html)
                    cook = cookie
                    if not cookie:
                        cook = cookiesObj
                    result, is_error = self.parser(html,
                                                   html_type=url_type,
                                                   asin=asin,
                                                   ip=ip,
                                                   ua=ua,
                                                   debug_log=self.debug_log,
                                                   monitor_type=monitor_type,
                                                   cookie=cook,
                                                   url=url)
                    if is_error:
                        self.the_url_is_discard(asin, url_dict, url_type,
                                                url_md5key)
                        msgInt = 3
                        proxyInfo = 'get data error'
                        self.record_log(asin, time1, msgInt, url_type,
                                        startTime, ip, proxyInfo)
                    else:
                        if not result:
                            self.the_url_is_discard(asin, url_dict, url_type,
                                                    url_md5key)
                            msgInt = 2
                            proxyInfo = 'get data failed'
                            self.record_log(asin, time1, msgInt, url_type,
                                            startTime, ip, proxyInfo)
                        else:
                            goods_datas = result[0]
                            if goods_datas:
                                print('goods_datas:', goods_datas)
                                qty, qtydt = inv_start(asin, html, cook, ua,
                                                       goods_datas[asin])
                                print('qty, qtydt:', qty, qtydt)
                                goods_datas[asin]['quantity'] = qty
                                goods_datas[asin]['qtydt'] = qtydt
                                print('goods_datas.add(qty, qtydt):',
                                      goods_datas)
                                bsr_data = result[1]
                                data_bytes = pickle.dumps(goods_datas)
                                if bsr_data:
                                    # print('bsr_data1', bsr_data)
                                    bsrData_bytes = pickle.dumps(bsr_data)
                                    # print('bsrData_bytes', bsrData_bytes)
                                    self.dataQ.add_bsrData_to_queue(
                                        bsrData_bytes)
                                    # print('bsr data ok', bsr_data)
                                # from pprint import pprint
                                # pprint(goods_datas)
                                result1 = self.dataQ.add_goods_data_to_queue(
                                    data_bytes)
                                print(result1)
                                if not result1:
                                    sys.exit()

                                # self.dataQ.record_data_ok_times()
                                # self.dataQ.record_goods_ok_times()
                                self.save_success_asin_keyword(
                                    asin, url_type=url_type)
                                msgInt = 1
                                proxyInfo = 'get data success'
                                self.record_log(asin, time1, msgInt, url_type,
                                                startTime, ip, proxyInfo)

                else:
                    self.the_url_is_discard(asin, url_dict, url_type,
                                            url_md5key)
                    time.sleep(1)
        else:
            print(url_type, 'no url')
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            time.sleep(1)
Code example #10
File: goodsCrawler.py Project: xusu12/hs_code
 def save_url_title(self, asin, html):
     urlTitle = GoodsParser.get_urltitle(asin, html)
     if urlTitle and len(urlTitle) <= 72:
         urlQ.add_urlTitle_to_string(asin, urlTitle)
Code example #11
File: BaseCrawler.py Project: xusu12/hs_code
 def make_search_url(self, kw, cid):
     search_url = GoodsParser.make_search_url(kw, cid)
     return search_url
Code example #12
File: BaseCrawler.py Project: xusu12/hs_code
 def is_page_not_found(self, html):
     return GoodsParser.is_page_not_found(html)
Code example #13
File: BaseCrawler.py Project: xusu12/hs_code
 def is_RobotCheck(self, html):
     return GoodsParser.is_RobotCheck(html)
Code example #14
File: goodsCrawler.py Project: xusu12/hs_code
    def download(self, asin_or_kw, url_dict):
        print(asin_or_kw, url_dict)
        url_type = self.url_type
        print(url_type)
        asin = asin_or_kw
        monitor_type = url_dict.get('mtp') or 1
        print('url type: ', url_type)
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(asin + url_type)
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time_now = lambda: time.time()
        time1 = time_now()
        retry = False
        old_dnum = url_dict.get('dnum') or 0
        if old_dnum > 3:
            retry = True
        url, referer = GoodsParser.make_goods_url(asin, retry=retry)
        if url:
            print('goods_url: ', url)
            html, cookiesObj, is_error = self.get_html(url,
                                                       referer=referer,
                                                       url_type=url_type,
                                                       asin=asin)
            print('is_error:', is_error)
            durl = url_dict.get('durl') or []
            durl.append(url)
            url_dict['durl'] = list(set(durl))
            url_dict['dnum'] = old_dnum + 1
            if is_error:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                msgInt = 6
                proxyInfo = 'get Html error'
                self.record_log(asin, time1, msgInt, url_type, startTime,
                                proxyInfo)

            else:
                analyze = self.analyze_html(html,
                                            asin_or_kw,
                                            url_dict,
                                            time1,
                                            startTime,
                                            html_type=url_type)
                if analyze and analyze != 404:
                    result, is_error = self.parser(html,
                                                   html_type=url_type,
                                                   asin=asin,
                                                   debug_log=self.debug_log,
                                                   monitor_type=monitor_type,
                                                   url=url)
                    if is_error:
                        self.the_url_is_discard(asin, url_dict, url_type,
                                                url_md5key)
                        msgInt = 3
                        proxyInfo = 'get data error'
                        self.record_log(asin, time1, msgInt, url_type,
                                        startTime, proxyInfo)
                    else:
                        if not result:
                            self.the_url_is_discard(asin, url_dict, url_type,
                                                    url_md5key)
                            msgInt = 2
                            proxyInfo = 'get data failed'
                            self.record_log(asin, time1, msgInt, url_type,
                                            startTime, proxyInfo)
                        else:
                            goods_datas = result[0]
                            if goods_datas:
                                cookies, headers = cookiesObj
                                user_anget = headers.get('User-Agent')
                                print(user_anget)
                                from pprint import pprint
                                pprint(cookies)
                                pprint(goods_datas)
                                msgInt = 1
                                proxyInfo = 'get data success'
                                log_param = (asin, time1, msgInt, url_type,
                                             startTime, proxyInfo)
                                start(asin=asin,
                                      goods_datas=goods_datas,
                                      user_anget=user_anget,
                                      url_dict=url_dict,
                                      goods_html=html,
                                      cookies=cookies,
                                      log_param=log_param,
                                      crawler_obj=self)
                else:
                    self.the_url_is_discard(asin, url_dict, url_type,
                                            url_md5key)
                    time.sleep(1)
        else:
            print(url_type, 'no url')
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            time.sleep(1)