Ejemplo n.º 1
0
def _unwrap_jd_jsonp(body: str, callback: str = 'fetchJSON_comment98') -> str:
    """Return the JSON text inside a ``callback(...);`` JSONP response body.

    A body that does not start with ``callback(`` is returned with only its
    trailing ``)`` / ``;`` characters removed.
    """
    prefix = callback + '('
    # str.lstrip() takes a *set of characters*, not a prefix, so the wrapper
    # is matched explicitly instead of relying on which characters the JSON
    # payload happens to start with.
    if body.startswith(prefix):
        body = body[len(prefix):]
    return body.rstrip(');')


def get_jd_comments(browser: Chrome, jd_ss: Union[Shop, JDSku], get_sku: bool = False,
                    sku_mode: bool = False, summary: bool = False):
    """Save every page of JD comments reachable from the current window.

    For each page the captured comment-API response is parsed, its
    ``comments`` list is passed to ``insert_jd_comments`` and, optionally, to
    ``get_sku_from_jd_comments``. The number of pages is taken from
    ``maxPage`` in the first response. Paging stops when the response is
    missing, a page has no comments, a ``WebDriverException`` is raised, or
    the last page has been processed. Afterwards ``back_to_first_window`` is
    called and the function sleeps for 10 seconds.

    Args:
        browser: Selenium Chrome driver showing the comment list.
        jd_ss: Shop or SKU record forwarded to the insert helpers.
        get_sku: When True, also call ``get_sku_from_jd_comments`` per page.
        sku_mode: When True, read the ``skuProductPageComments`` response
            instead of ``productPageComments``.
        summary: When set, store ``productCommentSummary`` from the first
            page (``insert_jd_model_summary`` in SKU mode, otherwise
            ``insert_jd_comment_summary``).
    """
    max_page = 141  # placeholder until the first response reports maxPage
    # Explicit flag instead of comparing max_page with the placeholder: the
    # countdown could otherwise hit 141 again and reload maxPage / re-insert
    # the summary on a later page.
    first_page = True
    while max_page > 0:
        try:
            # Fetch the comments of the current page
            if sku_mode is True:
                jd_comments_url = 'skuProductPageComments'
            else:
                jd_comments_url = 'productPageComments'
            jd_comments = get_response_body(browser, jd_comments_url, 'GET')
            if jd_comments is None:
                print('---未找到评论接口数据---')
                break
            jd_comments = json.loads(_unwrap_jd_jsonp(jd_comments))
            # Save the comments
            comment_list = jd_comments['comments']
            insert_jd_comments(comment_list, jd_ss)
            if len(comment_list) == 0:
                print('该页评论数据0条')
                break
            # Walk through every SKU mentioned in the comments
            if get_sku is True:
                get_sku_from_jd_comments(comment_list, jd_ss)
        except WebDriverException:
            print('---此页评论数据获取异常(WebDriverException), 跳过此分类---')
            break
        # Take the real page count (and the summary) from the first page only
        if first_page:
            first_page = False
            max_page = jd_comments['maxPage']
            if sku_mode and summary:
                sku_summary = jd_comments['productCommentSummary']
                first_comment = comment_list[0]
                insert_jd_model_summary(sku_summary, first_comment, jd_ss)
            elif summary is True:
                total_summary = jd_comments['productCommentSummary']
                insert_jd_comment_summary(total_summary, jd_ss)
        # No need to scroll on the last page
        max_page -= 1
        print(f'本轮剩余页数: {max_page}')
        if max_page == 0:
            break
        # Scroll down until the "next page" button is clickable, then click it.
        # NOTE(review): there is no retry limit; this spins for as long as the
        # button never becomes clickable.
        while True:
            try:
                WebDriverWait(browser, 0.5).until(
                    ec.element_to_be_clickable((By.CLASS_NAME, 'ui-pager-next'))
                )
                browser.execute_script('document.getElementsByClassName("ui-pager-next")[0].click()')
                waiting_content_loading(browser, 'comment-item')
                break
            except TimeoutException:
                window_scroll_by(browser, 200)

    back_to_first_window(browser)
    print('------当前浏览器窗口已关闭, 暂停10秒------')
    sleep(10)
def turn_to_the_next_page(browser: Chrome):
    """Click the ``.more`` element, scrolling until it becomes clickable.

    Each attempt waits up to 0.5 s for the element to be clickable, clicks it
    through JavaScript and then calls ``waiting_content_loading``. A
    ``TimeoutException`` anywhere in the attempt triggers
    ``window_scroll_by(browser, 500)`` and another attempt; there is no retry
    limit.
    """
    more_locator = (By.CLASS_NAME, 'more')
    click_script = 'document.querySelector(".more").click()'
    turned = False
    while not turned:
        try:
            clickable = ec.element_to_be_clickable(more_locator)
            WebDriverWait(browser, 0.5).until(clickable)
            browser.execute_script(click_script)
            waiting_content_loading(browser, 'common')
        except TimeoutException:
            # Not ready yet: scroll further down and retry
            window_scroll_by(browser, 500)
        else:
            turned = True
def turn_to_the_next_page(browser: Chrome):
    """Click the pager link in the 8th ``li.m-pagination-item``.

    Each attempt waits up to 0.5 s for the link to be clickable, clicks it
    through JavaScript and then calls ``waiting_content_loading``. A
    ``TimeoutException`` anywhere in the attempt triggers
    ``window_scroll_by(browser, 500)`` and another attempt; there is no retry
    limit.
    """
    next_locator = (By.CSS_SELECTOR,
                    'li.m-pagination-item:nth-child(8) > a:nth-child(1)')
    js_script = 'document.querySelector("li.m-pagination-item:nth-child(8) > a:nth-child(1)").click()'
    turned = False
    while not turned:
        try:
            clickable = ec.element_to_be_clickable(next_locator)
            WebDriverWait(browser, 0.5).until(clickable)
            browser.execute_script(js_script)
            waiting_content_loading(browser, 'commentItem')
        except TimeoutException:
            # Not ready yet: scroll further down and retry
            window_scroll_by(browser, 500)
        else:
            turned = True
def insert_jd_all_target_sku(browser: Chrome):
    """Store the target SKUs of every result page of the JD listing.

    On each page the pager is read for the total and current page numbers,
    the window is scrolled so the lazily loaded items appear,
    ``insert_jd_target_sku`` is called, and ``turn_to_the_next_page`` moves on
    until the current page equals the total.
    """
    total_xpath = '/html/body/div[7]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/span/i'
    current_xpath = '/html/body/div[7]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/span/b'
    # Placeholders; both values are replaced from the pager on the first pass
    total_pages, page_no = 141, 0
    while page_no <= total_pages:
        # Read total and current page numbers from the pager
        total_pages = int(browser.find_element_by_xpath(total_xpath).text)
        page_no = int(browser.find_element_by_xpath(current_xpath).text)
        print(f'总页数: {total_pages}, 当前页数: {page_no}')

        # Scroll so the page loads its remaining 30 items (lazy loading)
        window_scroll_by(browser, 3200)
        sleep(3)
        # Save the SKU numbers listed on the current page
        insert_jd_target_sku(browser)

        # Stop on the last page, otherwise go to the next one
        if page_no == total_pages:
            break
        turn_to_the_next_page(browser)
Ejemplo n.º 5
0
def insert_sn_all_target_sku(browser: Chrome):
    """Store the target SKUs of every result page of the Suning listing.

    On each page the pager is read for the total and current page numbers,
    the window is scrolled so the lazily loaded items appear,
    ``insert_sn_target_sku`` is called, and ``turn_to_the_next_page`` moves on
    until the current page equals the total.
    """
    pager_selector = '#second-filter > div > div.second-page.clearfix > span'
    current_selector = '#second-filter > div > div.second-page.clearfix > span > em'
    # Placeholders; both values are replaced from the pager on the first pass
    total_pages, page_no = 141, 0
    while page_no <= total_pages:
        # The pager text has a leading "<digits>/" part; drop it to keep the total
        pager_text = browser.find_element_by_css_selector(pager_selector).text
        total_pages = int(re.sub(r'^\d+?/', '', pager_text))
        page_no = int(browser.find_element_by_css_selector(current_selector).text)
        print(f'总页数: {total_pages}, 当前页数: {page_no}')

        # Scroll so the page loads its remaining 30 items (lazy loading)
        window_scroll_by(browser, 3600)
        sleep(3)
        # Save the SKU numbers listed on the current page
        insert_sn_target_sku(browser)

        # Stop on the last page, otherwise go to the next one
        if page_no == total_pages:
            break
        turn_to_the_next_page(browser)
def _unwrap_sn_jsonp(body: str, callback: str) -> str:
    """Return the JSON text inside a ``callback(...)`` JSONP response body.

    A body that does not start with ``callback(`` is returned with only its
    trailing ``)`` characters removed.
    """
    prefix = callback + '('
    # str.lstrip() takes a *set of characters*, not a prefix, so the wrapper
    # is matched explicitly instead of relying on which characters the JSON
    # payload happens to start with.
    if body.startswith(prefix):
        body = body[len(prefix):]
    return body.rstrip(')')


def get_sn_comments(browser: Chrome, sn_ss: Union[Shop, SNSku], sku_mode: bool = False):
    """Save every page of Suning comments reachable from the current window.

    Each page's captured review-list response is parsed and, when its
    ``returnMsg`` reports success, ``commodityReviews`` is passed to
    ``insert_sn_comments``. In SKU mode the first page additionally reads the
    review-count response and stores it with ``insert_sn_model_summary``.
    Paging stops when there is no review data, the list response does not
    report success, or a ``WebDriverException`` / ``AttributeError`` /
    ``TypeError`` is raised. Afterwards ``back_to_first_window`` is called and
    the function sleeps for 10 seconds.

    Args:
        browser: Selenium Chrome driver showing the review list.
        sn_ss: Shop or SKU record forwarded to the insert helpers.
        sku_mode: When True, read ``cluster_review_lists/general`` (per SKU);
            when False, read ``cluster_review_lists/cluster``.
    """
    page = 1
    while True:
        try:
            # Fetch the comments of the current page
            if sku_mode is True and page == 1:
                sn_comments = {}
                sn_model_summary = {}
                target_urls = [
                    {'url': 'cluster_review_lists/general', 'method': 'GET'},
                    {'url': 'review_count/general', 'method': 'GET'}
                ]
                all_data = get_response_body_list(browser, target_urls)
                for data in all_data:
                    if data['url'] == 'cluster_review_lists/general' and data['method'] == 'GET':
                        sn_comments = json.loads(
                            _unwrap_sn_jsonp(data['response_body'], 'reviewList'))
                    if data['url'] == 'review_count/general' and data['method'] == 'GET':
                        sn_model_summary = json.loads(
                            _unwrap_sn_jsonp(data['response_body'], 'satisfy'))
                # NOTE(review): if either response was not captured the dict is
                # still empty and the lookups below raise KeyError, which the
                # except clause of this loop does not catch.
                if sn_comments['returnMsg'] == '无评价数据':
                    print('---无评价数据, 跳过此SKU---')
                    break
                else:
                    if sn_model_summary['returnMsg'] == '查询数量成功':
                        insert_sn_model_summary(sn_model_summary['reviewCounts'][0],
                                                sn_comments['commodityReviews'][0]['commodityInfo'], sn_ss)
                    else:
                        print('---查询当前SKU评论统计数量失败---')
            else:
                if sku_mode is False:
                    sn_comments_url = 'cluster_review_lists/cluster'
                else:
                    sn_comments_url = 'cluster_review_lists/general'
                sn_comments = get_response_body(browser, sn_comments_url, 'GET')
                sn_comments = json.loads(_unwrap_sn_jsonp(sn_comments, 'reviewList'))

            # Save the comments
            if sn_comments['returnMsg'] == '成功取得评价列表':
                comment_list = sn_comments['commodityReviews']
                insert_sn_comments(comment_list, sn_ss)
            else:
                # At most 50 pages are served; a failure within the first 50
                # pages is unexpected and therefore reported
                if page <= 50:
                    print(f'---获取第{page}页评论数据异常---')
                break
        except (WebDriverException, AttributeError, TypeError):
            print(f'---获取第{page}页评论数据异常, 跳过此轮---')
            break

        print(f'当前页数: {page}')
        # Scroll down until the "next page" button is clickable, then click it.
        # NOTE(review): there is no retry limit; this spins for as long as the
        # button never becomes clickable.
        while True:
            try:
                WebDriverWait(browser, 0.5).until(
                    ec.element_to_be_clickable((By.CSS_SELECTOR, '.next.rv-maidian'))
                )
                browser.execute_script('document.getElementsByClassName("next rv-maidian")[0].click()')
                waiting_content_loading(browser, 'rv-target-item')
                break
            except TimeoutException:
                window_scroll_by(browser, 500)

        page += 1

    back_to_first_window(browser)
    print('------当前浏览器窗口已关闭, 暂停10秒------')
    sleep(10)
# Unit and filler tokens removed from JD dimension/weight values before float().
# '大约' must precede '约': removing '约' first would leave a stray '大' behind
# and make the float() conversion fail.
_JD_MEASUREMENT_NOISE = ('mm', 'MM', 'mM', 'Mm', 'g', 'G', '大约', '约', '左右', '大概')


def _clean_jd_measurement(raw: str) -> str:
    """Return *raw* with unit/filler tokens removed and whitespace stripped."""
    for token in _JD_MEASUREMENT_NOISE:
        raw = raw.replace(token, '')
    return raw.strip()


def insert_jd_all_commodity(browser: Chrome):
    """Scrape and save a ``Commodity`` row for every pending JD target SKU.

    For each ``TargetSku`` whose source is '京东': skip (and delete) it if an
    ``ExistedSku`` already exists, otherwise open the product page, read
    price, title, comment count, self-operated flag, shop name, brand and the
    specification tables, save the ``Commodity``, delete the target SKU and
    return to the first window. Fields missing from the page fall back to
    placeholder values; unparsable numeric specs are left unset.

    Args:
        browser: Selenium Chrome driver used to open the product pages.
    """
    for target_sku in TargetSku.select().where(TargetSku.source == '京东'):
        # SKU number of the current product
        sku: str = target_sku.sku
        # Skip SKUs already stored in the database so sales are not counted twice
        result = ExistedSku.get_or_none(ExistedSku.source == '京东',
                                        ExistedSku.sku == sku)
        if result is not None:
            # Remove the already-saved product from the target list
            delete_saved_commodity_sku(sku)
            print(f'---SKU编号为 {sku} 的商品信息已保存过---')
            continue

        # Start collecting the product information
        commodity = Commodity()
        commodity.source = '京东'
        commodity.url = 'https://item.jd.com/' + sku + '.html'

        # Open the product page and switch to it
        switch_to_current_sku_page(browser, commodity.url)
        # Fetch and save the listed SKUs from the backend API
        get_jd_sku_from_api(browser, sku)

        try:
            commodity.price = float(
                browser.find_element_by_css_selector(
                    'span.price:nth-child(2)').text)
        except (ValueError, NoSuchElementException):
            # Raised when the price reads "to be released" or the product is delisted
            commodity.price = -1

        try:
            commodity.title = browser.find_element_by_class_name(
                'sku-name').text.strip()
        except NoSuchElementException:
            commodity.title = '无商品标题'

        try:
            total_str = browser.find_element_by_css_selector(
                '#comment-count > a').text
            commodity.total = parse_jd_count_str(total_str)
        except NoSuchElementException:
            # Reservation-only products do not show the count next to the price
            commodity.total = -1

        # Self-operated (JD's own store) when the 'u-jd' badge reads '自营'
        try:
            commodity.is_self = browser.find_element_by_class_name('u-jd').text == '自营'
        except NoSuchElementException:
            commodity.is_self = False

        try:
            commodity.shop_name = browser.find_element_by_css_selector(
                '#crumb-wrap > div > div.contact.fr.clearfix > div.J-hove-wrap.EDropdown.fr > div:nth-child(1) > div '
                '> a').text
        except NoSuchElementException:
            commodity.shop_name = '店铺名称为空'

        # Product information from the "introduction" section
        try:
            commodity.brand = browser.find_element_by_css_selector(
                '#parameter-brand > li > a').text
        except NoSuchElementException:
            commodity.brand = '品牌未注明'

        intro_list = [li.text for li in browser.find_elements_by_css_selector('.parameter2 > li')]
        # Pre-assign defaults (per the original author: avoids errors from
        # saving empty values)
        commodity.os = '页面未注明'
        commodity.model = '页面未注明'
        for intro_item in intro_list:
            if '操作系统' in intro_item:
                commodity.os = intro_item.replace('操作系统:', '')
            if 'CPU型号' in intro_item:
                commodity.soc_model = intro_item.replace('CPU型号:', '')
            if '商品名称' in intro_item:
                commodity.model = intro_item.replace('商品名称:', '')

        # Scroll down and click the "specification & packaging" tab
        window_scroll_by(browser, 1200)
        js_script = 'document.querySelector("#detail > div.tab-main.large > ul > li:nth-child(2)").click()'
        browser.execute_script(js_script)
        sleep(1)

        # Product information from the "specification & packaging" tables
        spec_list = browser.find_elements_by_class_name('Ptable-item')
        for spec_item in spec_list:
            spec_item_title = spec_item.find_element_by_tag_name('h3').text
            item_list = spec_item.find_elements_by_class_name('clearfix')
            if '主体' == spec_item_title:
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value = item.find_element_by_tag_name('dd').text
                    if '产品名称' == item_name:
                        commodity.model = item_value
            elif '基本信息' == spec_item_title:
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value = _clean_jd_measurement(
                        item.find_element_by_tag_name('dd').text)
                    try:
                        if '机身宽度' in item_name:
                            commodity.width = float(item_value)
                        if '机身厚度' in item_name:
                            commodity.thickness = float(item_value)
                        if '机身长度' in item_name:
                            commodity.length = float(item_value)
                        if '机身重量' in item_name:
                            commodity.weight = float(item_value)
                    except ValueError:
                        # Best effort: leave the field unset when the value is not numeric
                        pass
            elif '主芯片' == spec_item_title:
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value = item.find_element_by_tag_name('dd').text
                    if 'CPU品牌' == item_name:
                        commodity.soc_mfrs = item_value
            elif '屏幕' == spec_item_title:
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value_str = item.find_element_by_tag_name('dd').text
                    if '主屏幕尺寸' in item_name:
                        try:
                            commodity.screen_size = float(
                                item_value_str.replace('英寸', '').strip())
                        except ValueError:
                            # Best effort: leave the field unset when the value is not numeric
                            pass
        # Save the product information
        commodity.save()
        # Remove the saved product from the target list
        delete_saved_commodity_sku(sku)
        print(f'------SKU编号为 {sku} 的商品信息保存完毕------')
        # Return to the phone category page
        back_to_first_window(browser)
def switch_to_youpin_default_comments_page(browser: Chrome):
    """Scroll down, click the second ``li.info-nav-item`` and wait for comments.

    After the click, ``waiting_content_loading`` is called for 'commentItem'.
    """
    click_comments_tab = 'document.querySelector("li.info-nav-item:nth-child(2)").click()'
    window_scroll_by(browser, 800)
    browser.execute_script(click_comments_tab)
    waiting_content_loading(browser, 'commentItem')
# Unit and filler tokens removed from Suning spec values before float().
# '大约' must precede '约': removing '约' first would leave a stray '大' behind
# and make the float() conversion fail.
_SN_MEASUREMENT_NOISE = ('mm', 'MM', '毫米', '英寸', 'mM', 'Mm', 'g', 'G',
                         '大约', '约', '左右', '克', '寸')


def _clean_sn_measurement(raw: str) -> str:
    """Return *raw* with unit/filler tokens removed and whitespace stripped."""
    for token in _SN_MEASUREMENT_NOISE:
        raw = raw.replace(token, '')
    return raw.strip()


def insert_sn_all_commodity(browser: Chrome):
    """Scrape and save a ``Commodity`` row for every pending Suning target SKU.

    For each ``SNTargetSku``: skip (and delete) it if an ``SNExistedSku``
    already exists, otherwise open the product page, read sales total,
    self-operated flag, title, price, shop name and the parameter tables, save
    the ``Commodity``, delete the target SKU and return to the first window.
    Fields missing from the page fall back to placeholder values; unparsable
    numeric specs are left unset.

    Args:
        browser: Selenium Chrome driver used to open the product pages.
    """
    for target_sku in SNTargetSku.select():
        # Shop code and SKU number of the current product
        shop_code: str = target_sku.shop_code
        sku: str = target_sku.sku
        # Skip SKUs already stored in the database so sales are not counted twice
        result = SNExistedSku.get_or_none(SNExistedSku.shop_code == shop_code,
                                          SNExistedSku.sku == sku)
        if result is not None:
            # Remove the already-saved product from the target list
            delete_saved_commodity_sku(shop_code, sku)
            print(f'---SKU编号为 {sku} 的商品信息已保存过---')
            continue

        # Start collecting the product information
        commodity = Commodity()
        commodity.source = '苏宁'
        commodity.url = 'https://product.suning.com/' + shop_code + '/' + sku + '.html'

        # Open the product page and switch to it
        switch_to_current_sku_page(browser, commodity.url)
        # Fetch all SKUs and the sales total from the backend API
        commodity.total = get_sn_sku_and_total_from_api(
            browser, shop_code, sku)

        # Suning self-operated products use shop code 0000000000
        commodity.is_self = int(shop_code) == 0

        try:
            commodity.title = browser.find_element_by_id(
                'itemDisplayName').text
        except NoSuchElementException:
            commodity.title = '无商品标题'

        try:
            commodity.price = float(
                browser.find_element_by_class_name('mainprice').text.replace(
                    '¥', ''))
        except (ValueError, NoSuchElementException):
            commodity.price = -2

        try:
            commodity.shop_name = browser.find_element_by_class_name(
                'header-shop-name').text
        except NoSuchElementException:
            commodity.shop_name = '店铺名称为空'

        # Product information from the "introduction" section.
        # Pre-assign defaults (per the original author: avoids errors from
        # saving empty values)
        commodity.brand = '页面未注明'
        commodity.model = '页面未注明'
        commodity.os = '页面未注明'

        intro_list = browser.find_elements_by_css_selector(
            '#phoneParameters > ul > li')
        for intro in intro_list:
            intro_title = intro.find_element_by_tag_name('p').text
            items = intro.find_elements_by_css_selector(
                'dl > dd > div > ul > li')
            if intro_title == '屏幕':
                for item in items:
                    item_text = item.text
                    if '屏幕尺寸' in item_text:
                        try:
                            commodity.screen_size = float(
                                item_text.replace('屏幕尺寸:', '').replace('英寸', '').strip())
                        except ValueError:
                            # Same best-effort policy as the parameter table
                            # below, which may still supply the screen size
                            pass
            if intro_title == 'CPU':
                for item in items:
                    item_text = item.text
                    if 'CPU型号' in item_text:
                        commodity.soc_model = item_text.replace('CPU型号:', '')

        # Scroll down and click the "packaging & parameters" tab
        window_scroll_by(browser, 1500)
        browser.execute_script(
            'document.querySelector("#productParTitle > a").click()')
        sleep(1)

        # Product information from the parameter table
        spec_list = browser.find_elements_by_css_selector(
            '#itemParameter > tbody > tr')
        for spec in spec_list:
            # Only rows carrying a 'parametercode' attribute are read
            if spec.get_attribute('parametercode') is None:
                continue
            spec_name = spec.find_element_by_tag_name('span').text
            spec_value = spec.find_element_by_class_name('val').text
            if spec_name == '品牌':
                commodity.brand = spec_value
            if spec_name == '型号':
                commodity.model = spec_value
            if spec_name == '手机操作系统':
                commodity.os = spec_value
            if spec_name == 'CPU品牌':
                commodity.soc_mfrs = spec_value
            if spec_name == 'CPU型号':
                commodity.soc_model = spec_value
            spec_val = _clean_sn_measurement(spec_value)
            try:
                if spec_name == '屏幕尺寸':
                    commodity.screen_size = float(spec_val)
                if spec_name == '机身长度':
                    commodity.length = float(spec_val)
                if spec_name == '机身宽度':
                    commodity.width = float(spec_val)
                if spec_name == '机身厚度':
                    commodity.thickness = float(spec_val)
                if spec_name == '重量':
                    commodity.weight = float(spec_val)
            except ValueError:
                # Best effort: leave the field unset when the value is not numeric
                pass

        # Save the product information
        commodity.save()
        # Remove the saved product from the target list
        delete_saved_commodity_sku(shop_code, sku)
        print(f'------SKU编号为 {sku} 的商品信息保存完毕------')
        # Return to the phone category page
        back_to_first_window(browser)
        sleep(2)