def _unwrap_jd_jsonp(payload: str) -> str:
    """Return the JSON text carried by a JSONP response body.

    ``callback({...});`` yields ``{...}``. A body that is already plain
    JSON (starts with ``{`` or ``[``) is returned unchanged apart from
    surrounding whitespace.
    """
    text = payload.strip()
    if text[:1] in ('{', '['):
        return text
    start = text.find('(')
    end = text.rfind(')')
    if start != -1 and end > start:
        return text[start + 1:end]
    return text


def get_jd_comments(browser: Chrome, jd_ss: Union[Shop, JDSku], get_sku: bool = False,
                    sku_mode: bool = False, summary: bool = False):
    """Walk the JD comment pages of the current product and store every page.

    For each page the captured comment API response is parsed, the comments
    are handed to ``insert_jd_comments`` and, on the first page only, the
    page count (``maxPage``) and optionally the comment summary are read.
    The function then clicks the "next page" pager until the last page.
    When the loop ends it switches back to the first window and sleeps 10s.

    Args:
        browser: Selenium Chrome driver positioned on the product page.
        jd_ss: Shop or SKU record passed through to the insert helpers.
        get_sku: When ``True``, also collect SKUs found in the comments.
        sku_mode: When ``True``, read the per-SKU comment endpoint
            (``skuProductPageComments``) instead of ``productPageComments``.
        summary: When true, store the comment summary from the first page
            (per-model summary if ``sku_mode`` is also set).
    """
    # Placeholder upper bound; replaced by the API's ``maxPage`` after the
    # first page has been parsed.
    max_page = 141
    # Explicit first-page flag: comparing ``max_page`` against the placeholder
    # would fire again if the countdown ever landed on that value.
    first_page = True
    comments_url = 'skuProductPageComments' if sku_mode is True else 'productPageComments'

    while max_page > 0:
        try:
            # Fetch the comment API response captured for the current page.
            raw_body = get_response_body(browser, comments_url, 'GET')
            if raw_body is None:
                print('---未找到评论接口数据---')
                break
            jd_comments = json.loads(_unwrap_jd_jsonp(raw_body))

            # Store the comments of this page.
            comment_list = jd_comments['comments']
            insert_jd_comments(comment_list, jd_ss)
            if not comment_list:
                print('该页评论数据0条')
                break

            # Collect every SKU mentioned in the comments.
            if get_sku is True:
                get_sku_from_jd_comments(comment_list, jd_ss)
        except WebDriverException:
            print('---此页评论数据获取异常(WebDriverException), 跳过此分类---')
            break

        # First page only: learn the real page count and store the summary.
        if first_page:
            first_page = False
            max_page = jd_comments['maxPage']
            if sku_mode and summary:
                sku_summary = jd_comments['productCommentSummary']
                first_comment = comment_list[0]
                insert_jd_model_summary(sku_summary, first_comment, jd_ss)
            elif summary is True:
                total_summary = jd_comments['productCommentSummary']
                insert_jd_comment_summary(total_summary, jd_ss)

        # No need to page forward after the last page.
        max_page -= 1
        print(f'本轮剩余页数: {max_page}')
        if max_page == 0:
            break

        # Scroll until the "next page" button is clickable, then click it.
        # NOTE(review): there is no retry cap; if the button never becomes
        # clickable this loop keeps scrolling.
        while True:
            try:
                WebDriverWait(browser, 0.5).until(
                    ec.element_to_be_clickable((By.CLASS_NAME, 'ui-pager-next'))
                )
                browser.execute_script('document.getElementsByClassName("ui-pager-next")[0].click()')
                waiting_content_loading(browser, 'comment-item')
                break
            except TimeoutException:
                window_scroll_by(browser, 200)

    back_to_first_window(browser)
    print('------当前浏览器窗口已关闭, 暂停10秒------')
    sleep(10)
def turn_to_the_next_page(browser: Chrome):
    """Click the ``.more`` element to load the next page of results.

    Waits up to 0.5s for the element to become clickable; on timeout the
    window is scrolled down by 500 and the attempt is repeated. After the
    click, waits for the ``common`` content to load.

    NOTE(review): SOURCE contains another function with this same name; if
    both live in one module the later definition replaces this one.
    """
    more_locator = (By.CLASS_NAME, 'more')
    advanced = False
    while not advanced:
        try:
            wait = WebDriverWait(browser, 0.5)
            wait.until(ec.element_to_be_clickable(more_locator))
            browser.execute_script('document.querySelector(".more").click()')
            waiting_content_loading(browser, 'common')
        except TimeoutException:
            # Button not reachable yet: scroll further down and retry.
            window_scroll_by(browser, 500)
        else:
            advanced = True
def turn_to_the_next_page(browser: Chrome):
    """Click the pagination link in the 8th ``m-pagination-item`` slot.

    Waits up to 0.5s for the link to become clickable; on timeout the window
    is scrolled down by 500 and the attempt is repeated. After the click,
    waits for the ``commentItem`` content to load.

    NOTE(review): SOURCE contains another function with this same name; if
    both live in one module only the later definition is effective.
    """
    next_link_selector = 'li.m-pagination-item:nth-child(8) > a:nth-child(1)'
    click_script = 'document.querySelector("' + next_link_selector + '").click()'
    advanced = False
    while not advanced:
        try:
            wait = WebDriverWait(browser, 0.5)
            wait.until(ec.element_to_be_clickable((By.CSS_SELECTOR, next_link_selector)))
            browser.execute_script(click_script)
            waiting_content_loading(browser, 'commentItem')
        except TimeoutException:
            # Link not reachable yet: scroll further down and retry.
            window_scroll_by(browser, 500)
        else:
            advanced = True
def insert_jd_all_target_sku(browser: Chrome):
    """Page through the JD listing and store the target SKUs of every page.

    On each page the total and current page numbers are read from the pager,
    the window is scrolled so the lazily loaded items appear, the page's SKUs
    are saved via ``insert_jd_target_sku`` and the next page is opened until
    the current page equals the total.
    """
    total_pages_xpath = '/html/body/div[7]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/span/i'
    current_page_xpath = '/html/body/div[7]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/span/b'

    # Placeholder values so the loop is entered; both are re-read from the page.
    total_pages, page_no = 141, 0
    while page_no <= total_pages:
        # Read the total page count and the current page number.
        total_pages = int(browser.find_element_by_xpath(total_pages_xpath).text)
        page_no = int(browser.find_element_by_xpath(current_page_xpath).text)
        print(f'总页数: {total_pages}, 当前页数: {page_no}')

        # Scroll down so the lazily loaded second batch of 30 items renders.
        window_scroll_by(browser, 3200)
        sleep(3)

        # Store the SKU ids of the items on the current page.
        insert_jd_target_sku(browser)

        # Stop on the last page, otherwise go to the next one.
        if page_no == total_pages:
            break
        turn_to_the_next_page(browser)
def insert_sn_all_target_sku(browser: Chrome):
    """Page through the Suning listing and store the target SKUs of every page.

    On each page the pager text is read (the leading ``<digits>/`` is removed
    to obtain the total), the window is scrolled so the lazily loaded items
    appear, the page's SKUs are saved via ``insert_sn_target_sku`` and the
    next page is opened until the current page equals the total.
    """
    pager_selector = '#second-filter > div > div.second-page.clearfix > span'
    current_selector = '#second-filter > div > div.second-page.clearfix > span > em'

    # Placeholder values so the loop is entered; both are re-read from the page.
    total_pages, page_no = 141, 0
    while page_no <= total_pages:
        # Read the total page count and the current page number.
        pager_text = browser.find_element_by_css_selector(pager_selector).text
        total_pages = int(re.sub(r'^\d+?/', '', pager_text))
        page_no = int(browser.find_element_by_css_selector(current_selector).text)
        print(f'总页数: {total_pages}, 当前页数: {page_no}')

        # Scroll down so the lazily loaded second batch of 30 items renders.
        window_scroll_by(browser, 3600)
        sleep(3)

        # Store the SKU ids of the items on the current page.
        insert_sn_target_sku(browser)

        # Stop on the last page, otherwise go to the next one.
        if page_no == total_pages:
            break
        turn_to_the_next_page(browser)
def _unwrap_sn_jsonp(payload: str) -> str:
    """Return the JSON text carried by a JSONP response body.

    ``callback({...})`` yields ``{...}``. A body that is already plain JSON
    (starts with ``{`` or ``[``) is returned unchanged apart from surrounding
    whitespace.
    """
    text = payload.strip()
    if text[:1] in ('{', '['):
        return text
    start = text.find('(')
    end = text.rfind(')')
    if start != -1 and end > start:
        return text[start + 1:end]
    return text


def get_sn_comments(browser: Chrome, sn_ss: Union[Shop, SNSku], sku_mode: bool = False):
    """Walk the Suning comment pages of the current product and store them.

    On the first page in SKU mode both the comment list and the review-count
    responses are read, and the per-model summary is stored. On every page
    the comments are handed to ``insert_sn_comments``; the loop ends when the
    API no longer reports a successful comment list or when a page cannot be
    read. Afterwards the first window is re-selected and the function sleeps
    for 10s.

    Args:
        browser: Selenium Chrome driver positioned on the product page.
        sn_ss: Shop or SKU record passed through to the insert helpers.
        sku_mode: When ``True`` read the ``general`` (per-SKU) endpoints,
            otherwise the ``cluster`` endpoint.
    """
    page = 1
    while True:
        try:
            # Fetch the comment data captured for the current page.
            if sku_mode is True and page == 1:
                sn_comments = {}
                sn_model_summary = {}
                target_urls = [
                    {'url': 'cluster_review_lists/general', 'method': 'GET'},
                    {'url': 'review_count/general', 'method': 'GET'}
                ]
                all_data = get_response_body_list(browser, target_urls)
                for data in all_data:
                    if data['url'] == 'cluster_review_lists/general' and data['method'] == 'GET':
                        sn_comments = json.loads(_unwrap_sn_jsonp(data['response_body']))
                    if data['url'] == 'review_count/general' and data['method'] == 'GET':
                        sn_model_summary = json.loads(_unwrap_sn_jsonp(data['response_body']))

                if sn_comments['returnMsg'] == '无评价数据':
                    print('---无评价数据, 跳过此SKU---')
                    break
                if sn_model_summary['returnMsg'] == '查询数量成功':
                    insert_sn_model_summary(sn_model_summary['reviewCounts'][0],
                                            sn_comments['commodityReviews'][0]['commodityInfo'],
                                            sn_ss)
                else:
                    print('---查询当前SKU评论统计数量失败---')
            else:
                if sku_mode is False:
                    sn_comments_url = 'cluster_review_lists/cluster'
                else:
                    sn_comments_url = 'cluster_review_lists/general'
                sn_comments = json.loads(
                    _unwrap_sn_jsonp(get_response_body(browser, sn_comments_url, 'GET')))

            # Store the comments of this page.
            if sn_comments['returnMsg'] == '成功取得评价列表':
                comment_list = sn_comments['commodityReviews']
                insert_sn_comments(comment_list, sn_ss)
            else:
                # The site serves at most 50 pages; a failure at or before
                # page 50 is unexpected and therefore reported.
                if page <= 50:
                    print(f'---获取第{page}页评论数据异常---')
                break
        except (WebDriverException, AttributeError, TypeError,
                KeyError, IndexError, ValueError):
            # Missing responses (None / untouched ``{}`` defaults) and
            # malformed JSON are treated the same way as driver errors:
            # give up on this round instead of crashing the crawl.
            print(f'---获取第{page}页评论数据异常, 跳过此轮---')
            break

        print(f'当前页数: {page}')

        # Scroll until the "next page" button is clickable, then click it.
        # NOTE(review): there is no retry cap; if the button never becomes
        # clickable this loop keeps scrolling.
        while True:
            try:
                WebDriverWait(browser, 0.5).until(
                    ec.element_to_be_clickable((By.CSS_SELECTOR, '.next.rv-maidian'))
                )
                browser.execute_script('document.getElementsByClassName("next rv-maidian")[0].click()')
                waiting_content_loading(browser, 'rv-target-item')
                break
            except TimeoutException:
                window_scroll_by(browser, 500)
        page += 1

    back_to_first_window(browser)
    print('------当前浏览器窗口已关闭, 暂停10秒------')
    sleep(10)
# Unit / approximation tokens removed from JD size and weight values before
# float(). Longer tokens come first so that stripping '约' cannot break '大约'
# apart and leave a stray '大' behind.
_JD_MEASUREMENT_NOISE = ('大约', '大概', '左右', '约', 'mm', 'MM', 'mM', 'Mm', 'g', 'G')

# (keyword contained in the spec name, Commodity attribute) for the
# "基本信息" section.
_JD_DIMENSION_FIELDS = (
    ('机身宽度', 'width'),
    ('机身厚度', 'thickness'),
    ('机身长度', 'length'),
    ('机身重量', 'weight'),
)


def _jd_measurement_to_float(raw_value):
    """Strip unit/approximation tokens from *raw_value* and parse a float, or return None."""
    cleaned = raw_value
    for token in _JD_MEASUREMENT_NOISE:
        cleaned = cleaned.replace(token, '')
    try:
        return float(cleaned.strip())
    except ValueError:
        return None


def insert_jd_all_commodity(browser: Chrome):
    """Scrape and store the product details of every pending JD target SKU.

    For each ``TargetSku`` whose source is JD: skip it (and delete the target
    row) if the SKU already exists, otherwise open the product page, read
    price, title, comment count, shop and spec data into a ``Commodity``,
    save it, delete the target row and return to the first window.

    Args:
        browser: Selenium Chrome driver used to open the product pages.
    """
    for target_sku in TargetSku.select().where(TargetSku.source == '京东'):
        # SKU id of the product being processed.
        sku: str = target_sku.sku

        # Skip SKUs that are already stored so sales are not counted twice.
        result = ExistedSku.get_or_none(ExistedSku.source == '京东', ExistedSku.sku == sku)
        if result is not None:
            # Drop the already-saved target SKU.
            delete_saved_commodity_sku(sku)
            print(f'---SKU编号为 {sku} 的商品信息已保存过---')
            continue

        # Start collecting the product information.
        commodity = Commodity()
        commodity.source = '京东'
        commodity.url = 'https://item.jd.com/' + sku + '.html'

        # Open the product page and switch to it.
        switch_to_current_sku_page(browser, commodity.url)

        # Fetch and store the listed SKUs from the backend API.
        get_jd_sku_from_api(browser, sku)

        try:
            commodity.price = float(
                browser.find_element_by_css_selector('span.price:nth-child(2)').text)
        except (ValueError, NoSuchElementException):
            # Price missing or non-numeric (e.g. unreleased or delisted item).
            commodity.price = -1

        try:
            commodity.title = browser.find_element_by_class_name('sku-name').text.strip()
        except NoSuchElementException:
            commodity.title = '无商品标题'

        try:
            total_str = browser.find_element_by_css_selector('#comment-count > a').text
            commodity.total = parse_jd_count_str(total_str)
        except NoSuchElementException:
            # The count is not shown next to the price for pre-order items.
            commodity.total = -1

        # Is this a JD self-operated item?
        try:
            is_self_operated = browser.find_element_by_class_name('u-jd').text == '自营'
        except NoSuchElementException:
            is_self_operated = False
        commodity.is_self = is_self_operated

        try:
            commodity.shop_name = browser.find_element_by_css_selector(
                '#crumb-wrap > div > div.contact.fr.clearfix > div.J-hove-wrap.EDropdown.fr > div:nth-child(1) > div '
                '> a').text
        except NoSuchElementException:
            commodity.shop_name = '店铺名称为空'

        # Read product information from the introduction tab.
        try:
            commodity.brand = browser.find_element_by_css_selector(
                '#parameter-brand > li > a').text
        except NoSuchElementException:
            commodity.brand = '品牌未注明'

        intro_list = [li.text for li in browser.find_elements_by_css_selector('.parameter2 > li')]

        # Defaults so saving does not fail on missing values.
        commodity.os = '页面未注明'
        commodity.model = '页面未注明'
        for intro_item in intro_list:
            if '操作系统' in intro_item:
                commodity.os = intro_item.replace('操作系统:', '')
            if 'CPU型号' in intro_item:
                commodity.soc_model = intro_item.replace('CPU型号:', '')
            if '商品名称' in intro_item:
                commodity.model = intro_item.replace('商品名称:', '')

        # Scroll down and open the "specification & packaging" tab.
        window_scroll_by(browser, 1200)
        js_script = 'document.querySelector("#detail > div.tab-main.large > ul > li:nth-child(2)").click()'
        browser.execute_script(js_script)
        sleep(1)

        # Read product information from the specification table.
        for spec_item in browser.find_elements_by_class_name('Ptable-item'):
            spec_item_title = spec_item.find_element_by_tag_name('h3').text
            item_list = spec_item.find_elements_by_class_name('clearfix')

            if spec_item_title == '主体':
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value = item.find_element_by_tag_name('dd').text
                    if item_name == '产品名称':
                        commodity.model = item_value

            elif spec_item_title == '基本信息':
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    measurement = _jd_measurement_to_float(
                        item.find_element_by_tag_name('dd').text)
                    if measurement is None:
                        # Non-numeric value: leave the attribute unset.
                        continue
                    for keyword, attribute in _JD_DIMENSION_FIELDS:
                        if keyword in item_name:
                            setattr(commodity, attribute, measurement)

            elif spec_item_title == '主芯片':
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value = item.find_element_by_tag_name('dd').text
                    if item_name == 'CPU品牌':
                        commodity.soc_mfrs = item_value

            elif spec_item_title == '屏幕':
                for item in item_list:
                    item_name = item.find_element_by_tag_name('dt').text
                    item_value_str = item.find_element_by_tag_name('dd').text
                    if '主屏幕尺寸' in item_name:
                        try:
                            commodity.screen_size = float(
                                item_value_str.replace('英寸', '').strip())
                        except ValueError:
                            pass

        # Persist the product information.
        commodity.save()

        # Drop the target SKU that has just been saved.
        delete_saved_commodity_sku(sku)
        print(f'------SKU编号为 {sku} 的商品信息保存完毕------')

        # Return to the phone category page.
        back_to_first_window(browser)
def switch_to_youpin_default_comments_page(browser: Chrome):
    """Scroll down, click the second ``info-nav-item`` tab and wait for comments.

    The wait completes once ``waiting_content_loading`` returns for the
    ``commentItem`` content.
    """
    open_comments_tab = 'document.querySelector("li.info-nav-item:nth-child(2)").click()'
    window_scroll_by(browser, 800)
    browser.execute_script(open_comments_tab)
    waiting_content_loading(browser, 'commentItem')
# Unit / approximation tokens removed from Suning spec values before float().
# Longer tokens come first so that stripping '约' cannot break '大约' apart
# (and '寸' cannot break '英寸').
_SN_MEASUREMENT_NOISE = ('大约', '左右', '毫米', '英寸', '约', '克', '寸',
                         'mm', 'MM', 'mM', 'Mm', 'g', 'G')

# Spec-table names copied verbatim into Commodity attributes.
_SN_TEXT_FIELDS = {
    '品牌': 'brand',
    '型号': 'model',
    '手机操作系统': 'os',
    'CPU品牌': 'soc_mfrs',
    'CPU型号': 'soc_model',
}

# Spec-table names parsed as numbers into Commodity attributes.
_SN_NUMERIC_FIELDS = {
    '屏幕尺寸': 'screen_size',
    '机身长度': 'length',
    '机身宽度': 'width',
    '机身厚度': 'thickness',
    '重量': 'weight',
}


def _sn_measurement_to_float(raw_value):
    """Strip unit/approximation tokens from *raw_value* and parse a float, or return None."""
    cleaned = raw_value
    for token in _SN_MEASUREMENT_NOISE:
        cleaned = cleaned.replace(token, '')
    try:
        return float(cleaned.strip())
    except ValueError:
        return None


def insert_sn_all_commodity(browser: Chrome):
    """Scrape and store the product details of every pending Suning target SKU.

    For each ``SNTargetSku``: skip it (and delete the target row) if the
    shop/SKU pair already exists, otherwise open the product page, read
    sales, title, price, shop and spec data into a ``Commodity``, save it,
    delete the target row, return to the first window and sleep 2s.

    Args:
        browser: Selenium Chrome driver used to open the product pages.
    """
    for target_sku in SNTargetSku.select():
        # Shop code and SKU id of the product being processed.
        shop_code: str = target_sku.shop_code
        sku: str = target_sku.sku

        # Skip SKUs that are already stored so sales are not counted twice.
        result = SNExistedSku.get_or_none(SNExistedSku.shop_code == shop_code,
                                          SNExistedSku.sku == sku)
        if result is not None:
            # Drop the already-saved target SKU.
            delete_saved_commodity_sku(shop_code, sku)
            print(f'---SKU编号为 {sku} 的商品信息已保存过---')
            continue

        # Start collecting the product information.
        commodity = Commodity()
        commodity.source = '苏宁'
        commodity.url = 'https://product.suning.com/' + shop_code + '/' + sku + '.html'

        # Open the product page and switch to it.
        switch_to_current_sku_page(browser, commodity.url)

        # Fetch all SKUs and the sales figure from the backend API.
        commodity.total = get_sn_sku_and_total_from_api(browser, shop_code, sku)

        # Suning self-operated shops use the all-zero shop code 0000000000.
        commodity.is_self = int(shop_code) == 0

        try:
            commodity.title = browser.find_element_by_id('itemDisplayName').text
        except NoSuchElementException:
            commodity.title = '无商品标题'

        try:
            commodity.price = float(
                browser.find_element_by_class_name('mainprice').text.replace('¥', ''))
        except (ValueError, NoSuchElementException):
            commodity.price = -2

        try:
            commodity.shop_name = browser.find_element_by_class_name('header-shop-name').text
        except NoSuchElementException:
            commodity.shop_name = '店铺名称为空'

        # Read product information from the introduction section.
        # Defaults so saving does not fail on missing values.
        commodity.brand = '页面未注明'
        commodity.model = '页面未注明'
        commodity.os = '页面未注明'
        for intro in browser.find_elements_by_css_selector('#phoneParameters > ul > li'):
            intro_title = intro.find_element_by_tag_name('p').text
            items = intro.find_elements_by_css_selector('dl > dd > div > ul > li')
            if intro_title == '屏幕':
                for item in items:
                    if '屏幕尺寸' in item.text:
                        try:
                            commodity.screen_size = float(
                                item.text.replace('屏幕尺寸:', '').replace('英寸', '').strip())
                        except ValueError:
                            # Non-numeric size text: leave the attribute unset
                            # instead of aborting the whole crawl.
                            pass
            if intro_title == 'CPU':
                for item in items:
                    if 'CPU型号' in item.text:
                        commodity.soc_model = item.text.replace('CPU型号:', '')

        # Scroll down and open the "packaging & parameters" tab.
        window_scroll_by(browser, 1500)
        browser.execute_script(
            'document.querySelector("#productParTitle > a").click()')
        sleep(1)

        # Read product information from the parameter table.
        for spec in browser.find_elements_by_css_selector('#itemParameter > tbody > tr'):
            # Only rows carrying a ``parametercode`` attribute hold a spec.
            if spec.get_attribute('parametercode') is None:
                continue
            spec_name = spec.find_element_by_tag_name('span').text
            spec_value = spec.find_element_by_class_name('val').text

            text_attribute = _SN_TEXT_FIELDS.get(spec_name)
            if text_attribute is not None:
                setattr(commodity, text_attribute, spec_value)

            numeric_attribute = _SN_NUMERIC_FIELDS.get(spec_name)
            if numeric_attribute is not None:
                measurement = _sn_measurement_to_float(spec_value)
                if measurement is not None:
                    setattr(commodity, numeric_attribute, measurement)

        # Persist the product information.
        commodity.save()

        # Drop the target SKU that has just been saved.
        delete_saved_commodity_sku(shop_code, sku)
        print(f'------SKU编号为 {sku} 的商品信息保存完毕------')

        # Return to the phone category page.
        back_to_first_window(browser)
        sleep(2)