# Helpers such as delSpecialChars(), fileUtils() and the module-level
# timestamp now_time are assumed to be defined elsewhere in this project.
import json
import re
from time import sleep

import requests


def JinDongCommentCases(url_list):
    prod_url = url_list[2]
    prod_topic = url_list[1]
    prod_id = re.findall(r'(\d+)', prod_url)[0]
    print("Start fetching comment info for product " + prod_topic + ':' + str(prod_id))
    # Collected comment records
    rates_list = []
    # First page of the comment endpoints
    rate_urls = [
        # Default comment API
        'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv44&productId={0}&score=0&sortType=6&page={1}&pageSize=10&isShadowSku=0&fold=1',
        # Folded-comment API (of little reference value)
        'https://club.jd.com/comment/getProductPageFoldComments.action?callback=jQuery1719501&productId={0}&score=4&sortType=5&page={1}&pageSize=5&_=1573096406813'
    ]
    # 1. Build the request headers
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        "cookie": '__jdu=1150543271; shshshfpa=0cb162de-cb82-21b8-49a7-7e1fd26a3efd-1570864191; user-key=d5809892-c823-402e-9748-c84b2469d56f; cn=0; shshshfpb=eTsoprn6f4hkN00S8LggPuQ%3D%3D; unpl=V2_ZzNtbRYAS0Z8WkQAehlVB2JQRl0SUUcVd1oTAC8YVFIyV0BYclRCFX0URlVnG10UZwYZWEtcRx1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWARjBhBeRFdzJXI4dmR%2bG1gDbwIiXHJWc1chVEVSexlcDSoDEllDU0YXdg5GZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ef9b8c3e01834be1a7513cdee09fdec4|1572418139698; shshshfp=4ecb84897eabb0f7a4c6348b7cdc7d0a; __jda=122270672.1150543271.1570864187.1572825530.1573090824.9; __jdc=122270672; areaId=12; ipLoc-djd=12-984-3384-0; wlfstk_smdl=gcda47s1yytkclehvxho46m7ddz5g7ow; TrackID=1KNUUCIn3e7IMNektPzhbcu7wSO0kDr7PEe_KWvFCOXkJh4Zo6p9lf8KOj5iwp4Yidll4C9iAu7fQF6LVOjeB1LGNsaTdxOTqpshIt79InXGwUBG-R8JW8h4lpF-aMXFlBoc7nuE4YFFi_IXSENLUoA; thor=F5548B286F0AC84835F479E2098B937588592D856D78425D7FC38CD7238081AFCBA255023DFA3D8E13AF80EB0481FBDF4DA6C1A35102B43FEA63A3914094409E2250E5F462224217F1004694F9EC7CF2DA417BF181A528377DE99BED15AD4C25157B03BD7C98D6058B3B22E3F300B51E9F9A64987B3D551B14DCFF630D20CCBF954CBC1087415F2C2203531C10B881874F74CD45F930D0F4802E5F203320EEDE; pinId=eqbOg6AqvNqT4t6ZRIp7VrV9-x-f3wj7; pin=jd_5580681fb886d; unick=jd_181685ayj; ceshi3.com=103; _tp=OQVsjG6Pu5TIXKleFObW0uc7fxOqC8rImaa7i%2FLjfqM%3D; _pst=jd_5580681fb886d; shshshsID=d4ef035cd6502b3e3bbb5e5859bb09c1_2_1573090894262; __jdb=122270672.4.1150543271|9.1573090824; 3AB9D23F7A4B3C9B=4WQN5JCPKTD4EYGF7GGHYDUIBN64EH5SZHPCNA56CB2G7HP52UGN73YBUMQ2EOMZI4WXVSWB3CSTQT2KOLQIVGGV5A; JSESSIONID=99B9C173D8D05BABCE00F2429A497E26.s1',
        "referer": "{0}".format(prod_url),
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400"
    }
    # Fetch the summary from the default API to learn the total comment count;
    # the JSONP wrapper is stripped with a regex before json.loads().
    response = requests.get(rate_urls[0].format(prod_id, 0), headers=headers)
    rates_jsons = json.loads(re.findall('{.*}', response.text)[0])
    sleep(3)
    # Total number of comments (served 10 per page)
    pages = rates_jsons['productCommentSummary']['commentCount']
    print("===============================")
    for rate_url in rate_urls:
        for page in range(0, int(pages / 10 + 1)):
            print("Total " + str(pages) + " comments, fetching page " + str(page))
            sleep(3)
            try:
                rates_responses = requests.get(rate_url.format(prod_id, page),
                                               headers=headers)
                rates = json.loads(re.findall('{.*}', rates_responses.text)[0])
            except BaseException:
                print("No data")
                break
            rates_lists = rates['comments']
            for rate_list in rates_lists:
                rate = delSpecialChars(rate_list['content'])
                prod_color = rate_list['productColor']
                prod_name = rate_list['referenceName']
                rate_score = rate_list['score']
                rate_dict = {
                    'add_time': now_time,
                    'prod_name': prod_name,
                    'rate_score': rate_score,
                    'rate': rate,
                    'prod_color': prod_color,
                    'prod_url': prod_url,
                    'prod_topic': prod_topic,
                    'prod_id': prod_id,
                    'sale_num': str(pages)
                }
                print(rate_dict)
                rates_list.append(rate_dict)
            if not rates_lists:
                break
    # Persist the collected comments
    fileUtils().saveAsCsv(rates_list, './Data/Rates/{0}'.format(str(prod_id)))
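# --- Usage sketch (assumption): a minimal driver for JinDongCommentCases.
# The url_list layout (index 1 = topic, index 2 = product URL) follows the
# indexing inside the function; the topic and product id below are made-up
# placeholders, not values taken from this project.
if __name__ == '__main__':
    sample_url_list = ['-', 'ebike', 'https://item.jd.com/100012345678.html']
    JinDongCommentCases(sample_url_list)
    # Comments would land in ./Data/Rates/100012345678 via fileUtils().saveAsCsv()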
# Fragment: scrape the Shanghai Cr-series alloy structural steel quotes
# (see the context sketch after this fragment).
driver.get(steelUrl)
time.sleep(1)
button = driver.find_element_by_link_text('{}日上海市场Cr系合结钢价格行情'.format(now_day))
real_url = button.get_attribute('href')
print("Scraping material: " + origin_name + ", URL: " + real_url)
sleep(1)
driver.get(real_url)
sleep(1)
xpath_date = etree.HTML(driver.page_source)
# Rows of the market table for the data-update date
tr_list = xpath_date.xpath('//table[@id="marketTable"]/tbody/tr')
for tr in tr_list:
    try:
        # origin_name = 'Cr系合结钢(40CrΦ20-28)'
        name = delSpecialChars(tr.xpath('./td[1]/a/text()')[0])
        paihao = delSpecialChars(tr.xpath('./td[2]/text()')[0])
        guige = delSpecialChars(tr.xpath('./td[4]/text()')[0])
        area = delSpecialChars(tr.xpath('./td[5]/text()')[0])
        real_name = name + '(' + guige + ')'
        if origin_name == real_name and area == '杭钢' and paihao == '40Cr':
            price = delSpecialChars(tr.xpath('./td[6]/text()')[0])
            steelDict = {'name': real_name, 'area': area, 'date': now_time,
                         'price': price, 'add_time': now_time}
            print(steelDict)
            table.append(steelDict)
    except BaseException:
        print("error")
        continue
print('Shanghai Cr-series alloy steel quotes for day {} fetched successfully'.format(now_day))
print('===================================================')
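# --- Context sketch (assumptions): the fragment above relies on names set up
# by surrounding code. One plausible setup; steelUrl is a placeholder, and
# delSpecialChars() comes from elsewhere in the project.
import time
from datetime import datetime
from time import sleep

from lxml import etree
from selenium import webdriver

steelUrl = 'https://example.com/steel-market'         # placeholder listing page
origin_name = 'Cr系合结钢(40CrΦ20-28)'                 # target material, per the comment above
now_day = datetime.now().day                          # day-of-month used in the link text
now_time = datetime.now().strftime('%Y-%m-%d %H:%M')  # timestamp stored with each row
table = []                                            # collected price rows
driver = webdriver.Chrome()                           # assumes chromedriver on PATH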
def JDCarsInfo(car_name):
    # Collected product records
    product_list = []
    # Selenium options
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    option.add_argument('incognito')
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip image loading to speed up page fetches
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Search URL
    url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8'.format(car_name)
    driver.get(url)
    sleep(2)
    html_doc = driver.page_source
    # Normalize the markup with BeautifulSoup before handing it to lxml
    soup = BeautifulSoup(html_doc, 'lxml').prettify()
    # Read the page count (a single page of results has no page marker)
    pages = etree.HTML(soup).xpath(
        '//*[@id="J_bottomPage"]/span[2]/em[1]/b/text()')
    if len(pages) == 0:
        pages = '1'
    else:
        pages = re.findall(r'\d+', pages[0])[0]
    print('***** Start scraping ' + car_name + ': ' + pages + ' pages in total *****')
    sleep(1)
    # Walk every result page (JD numbers pages 1, 3, 5, ...)
    for page in range(1, int(pages) + 1):
        # for page in range(5):
        url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8&page={1}'.format(
            car_name, str(page * 2 - 1))
        print('Sleeping 2s... scraping page ' + str(page) + ', url: ' + url)
        driver.get(url)
        # sleep(2)
        # Parse with lxml xpath
        xpath_date = etree.HTML(
            BeautifulSoup(driver.page_source, 'lxml').prettify())
        products = xpath_date.xpath('//*[@id="J_goodsList"]/ul/li')
        for product in products:
            # Product attributes
            try:
                prod_store = delSpecialChars(
                    product.xpath(
                        './/a[@class="curr-shop hd-shopname"]/text()')[0])
                prod_price = delSpecialChars(
                    product.xpath('./div/div[3]/strong/i/text()')[0])
                # Keep only the integer yuan part of the price
                prod_price = prod_price.split('.')[0]
                prod_url = 'https:' + delSpecialChars(
                    product.xpath('./div/div[4]/a/@href')[0])
                prod_id = re.findall(r'(\d+)', prod_url)[0]
                store_url = 'https:' + delSpecialChars(
                    product.xpath('./div/div[7]/span/a/@href')[0])
                store_id = re.findall(r'(\d+)', store_url)[0]
                store_rates = '评价' + delSpecialChars(
                    product.xpath('./div/div[5]/strong/a/text()')[0])
                # Open the product page for the detailed info
                driver.get(prod_url)
                prod_reponse = driver.page_source
                # sleep(1)
                selector = etree.HTML(prod_reponse)
                prod_name = delSpecialChars(
                    selector.xpath('//div[@class="sku-name"]/text()')[0])
                # Detailed product configuration entries
                shop_items = selector.xpath(
                    '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li')
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(
                        delSpecialChars(shop_item.xpath('./text()')[0]))
                # Store rating
                store_score = delSpecialChars(
                    selector.xpath(
                        '//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[2]/div/div/div/div/@title'
                    )[0])
                prod_dict = {
                    'prod_store': prod_store,
                    'prod_price': prod_price,
                    'prod_url': prod_url,
                    'store_url': store_url,
                    'store_rates': store_rates,
                    'prod_name': prod_name,
                    'shop_list': str(shop_list),
                    'store_score': store_score,
                    'add_time': now_time,
                    'prod_id': prod_id,
                    'store_id': store_id
                }
                # Items under 1000 yuan are accessories, not complete vehicles
                if int(prod_price) < 1000:
                    continue
                product_list.append(prod_dict)
                print("Product info: " + str(prod_dict))
            except BaseException:
                print('Product info does not meet the criteria')
    # Shut down the browser
    driver.quit()
    return product_list
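# --- Usage sketch (assumption): a one-off run of JDCarsInfo. The keyword is a
# placeholder; saving via the project's fileUtils() mirrors how the other
# functions in this file persist their results, and the path is an assumption.
if __name__ == '__main__':
    jd_products = JDCarsInfo('电动车')
    print('Collected {} products priced at 1000 yuan or more'.format(len(jd_products)))
    fileUtils().saveAsCsv(jd_products, './Data/Product/jd_cars')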
def TMCarsInfo(car_name):
    # Collected product records
    product_list = []
    # Selenium options
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    option.add_argument('incognito')
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Search URL
    url = 'https://list.tmall.com/search_product.htm?q={0}'.format(car_name)
    driver.get(url)
    sleep(2)
    html_doc = driver.page_source
    # Normalize the markup with BeautifulSoup before handing it to lxml
    soup = BeautifulSoup(html_doc, 'lxml').prettify()
    # Read the page count
    pages = re.findall(r'共(\d+)页', soup)[0]
    print('***** Start scraping ' + car_name + ': ' + pages + ' pages in total *****')
    # Walk every result page
    for page in range(1, int(pages) + 1):
        url = 'https://list.tmall.com/search_product.htm?q={0}&totalPage={1}&jumpto={2}'.format(
            car_name, pages, page)
        print('Sleeping 5s... scraping page ' + str(page) + ', url: ' + url)
        driver.get(url)
        sleep(5)
        # Parse with lxml xpath
        xpath_date = etree.HTML(
            BeautifulSoup(driver.page_source, 'lxml').prettify())
        products = xpath_date.xpath('//*[@id="J_ItemList"]/div')
        # Extract each product's info
        for product in products:
            prod_url = 'https:' + product.xpath('./div/div[1]/a/@href')[0]
            prod_id = re.findall(r'\?id=(\d+)', prod_url)[0]
            prod_price = product.xpath('./div/p[1]/em/@title')[0]
            # Keep only the integer yuan part of the price
            prod_price = prod_price.split('.')[0]
            prod_title = product.xpath('./div/p[2]/a/@title')[0]
            prod_store = product.xpath('./div/p[3]/span[3]/@data-nick')[0]
            prod_msale = delSpecialChars(
                product.xpath('./div/p[3]/span[1]/text()')[0]) + delSpecialChars(
                    product.xpath('./div/p[3]/span[1]/em/text()')[0])
            prod_rates = delSpecialChars(
                product.xpath('./div/p[3]/span[2]/text()')[0]) + delSpecialChars(
                    product.xpath('./div/p[3]/span[2]/a/text()')[0])
            store_url = 'https:' + product.xpath('./div/div[2]/a/@href')[0]
            store_id = re.findall(r'\?user_number_id=(\d+)', store_url)[0]
            # Open the product page for the detailed info
            driver.get(prod_url)
            sleep(1)
            prod_reponse = BeautifulSoup(driver.page_source, 'lxml').prettify()
            selector = etree.HTML(prod_reponse)
            shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
            # Detailed product configuration entries
            shop_list = []
            for shop_item in shop_items:
                shop_list.append(
                    delSpecialChars(shop_item.xpath('./text()')[0]))
            # Store rating entries
            score_list = []
            scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
            for score in scores:
                res_score = delSpecialChars(
                    score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                        score.xpath('./div[2]/span/text()')[0])
                score_list.append(res_score)
            # Favorites popularity (xpath noted but never queried)
            popularity = '/html/body/div[5]/div/div[2]/div/div[1]/div[2]/p/span[2]'
            # Save the record
            prod_dict = {
                'car_name': car_name,
                'add_time': now_time,
                'prod_url': prod_url,
                'prod_title': prod_title,
                'store_url': store_url,
                'prod_price': prod_price,
                'prod_store': prod_store,
                'prod_msale': prod_msale,
                'prod_rates': prod_rates,
                'prod_id': prod_id,
                'store_id': store_id,
                'shop_list': str(shop_list),
                'score_list': str(score_list)
            }
            try:
                # Keep only complete vehicles (priced at 1000 yuan or more)
                if int(prod_price) >= 1000:
                    product_list.append(prod_dict)
            except ValueError:
                print('Product does not meet the criteria')
            print("Product info: " + str(prod_dict))
    # Shut down the browser
    driver.quit()
    return product_list
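# --- Design sketch (assumption): JDCarsInfo and TMCarsInfo both reduce the
# scraped price text to its integer yuan part before the >= 1000 filter. A
# hypothetical helper that centralizes that parsing and tolerates bad input:
def price_to_int(price_text):
    """Return the integer yuan part of a price string such as '2999.00'."""
    whole = price_text.split('.')[0]
    return int(whole) if whole.isdigit() else 0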
def TMProductInfos(topic_list):
    option = webdriver.ChromeOptions()
    option.add_argument('--proxy-server=127.0.0.1:8080')
    # Reduce automation fingerprinting
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip image loading to speed up page fetches
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # option.add_argument('headless')
    # Point this at the chromedriver build for your OS
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Log in to Taobao
    login_url = 'https://login.taobao.com/member/login.jhtml'
    driver.get(login_url)
    # Switch to the password-login form
    password_login = driver.find_element_by_xpath(
        '//div[@class="login-links"]/a[@class="forget-pwd J_Quick2Static"]')
    password_login.click()
    # Open the Weibo login option
    weibo_login = driver.find_element_by_xpath('//a[@class="weibo-login"]')
    weibo_login.click()
    # Wait for the Weibo account field to appear
    weibo_user = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[2]/div/input'))
    weibo_user.send_keys('18168546559')
    sleep(1)
    # Wait for the Weibo password field to appear
    weibo_pwd = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[3]/div/input'))
    weibo_pwd.send_keys('zj123!')
    # Wait for the login button to appear
    submit = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
    submit.click()
    sleep(10)
    for topic in topic_list:
        print("===== Start scraping product info for " + topic + " =====")
        # Collected product records for this topic
        products_lists = []
        # Search home page (switch to Tmall)
        driver.get('https://www.tmall.com/')
        # Run the Tmall search
        search_input = WebDriverWait(
            driver, timeout).until(lambda d: d.find_element_by_xpath(
                '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/div/div/input'
            ))
        search_input.send_keys(topic + '电动车')
        submit = WebDriverWait(
            driver, timeout).until(lambda d: d.find_element_by_xpath(
                '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/button'
            ))
        submit.click()
        sleep(2)
        # Parse the search result page
        xpath_date = etree.HTML(driver.page_source)
        # Total page count for the current search
        page_sum = xpath_date.xpath(
            '//div[@class="ui-page-wrap"]/b[2]/form/input[3]/@value')[0]
        print("===== Current search has " + page_sum + " pages =====")
        # Fetch part of the data from every page
        for index in range(0, int(page_sum)):
            print("===== Fetching product data on page " + str(index + 1) + " =====")
            # xpath node set for each product on the page
            xpath_dates = etree.HTML(driver.page_source)
            product_lists = xpath_dates.xpath('//div[@id="J_ItemList"]/div')
            # A second, headless browser fetches each product's detail page
            option.add_argument('headless')
            driver2 = webdriver.Chrome(
                executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
                chrome_options=option)
            # Extract the attributes of every product
            for product_list in product_lists:
                product_price = product_list.xpath(
                    './div/p[@class="productPrice"]/em/@title')[0]
                # Keep only the integer yuan part of the price
                product_price = product_price.split('.')[0]
                if product_price == '':
                    print("No usable price; not a complete vehicle, skipping")
                    continue
                if int(product_price) < 1000:
                    print("Price below 1000; not a complete vehicle, skipping")
                    continue
                product_title = product_list.xpath(
                    './div/p[@class="productTitle"]/a/@title')[0]
                product_url = 'http:' + product_list.xpath(
                    './div/p[@class="productTitle"]/a/@href')[0]
                product_id = re.findall(
                    r'//detail.tmall.com/item.htm\?id=(\d+)&', product_url)[0]
                shop_url = 'http:' + product_list.xpath(
                    './div/div[@class="productShop"]/a[@class="productShop-name"]/@href'
                )[0]
                shop_id = re.findall(r'user_number_id=(\d+)&', shop_url)[0]
                shop_name = product_list.xpath(
                    './div/p[@class="productStatus"]/span[3]/@data-nick')[0]
                month_sale = product_list.xpath(
                    './div/p[@class="productStatus"]/span/em/text()')[0]
                comment_sum = product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/text()')[0]
                comment_url = 'https:' + product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/@href')[0]
                # Open the product page for the detailed info
                driver2.get(product_url)
                selector = etree.HTML(driver2.page_source)
                shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
                # Detailed product configuration entries
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(
                        delSpecialChars(shop_item.xpath('./text()')[0]))
                # Store rating entries
                score_list = []
                scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
                for score in scores:
                    res_score = delSpecialChars(
                        score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                            score.xpath('./div[2]/span/text()')[0])
                    score_list.append(res_score)
                # Save the record
                product_dict = {
                    'product_title': product_title,
                    'product_id': product_id,
                    'product_url': product_url,
                    'product_price': product_price,
                    'month_sale': month_sale,
                    "shop_name": shop_name,
                    "shop_url": shop_url,
                    "shop_id": shop_id,
                    "comment_sum": comment_sum,
                    'add_time': add_time,
                    'topic': topic,
                    'shop_list': str(shop_list),
                    'score_list': str(score_list),
                    'comment_url': comment_url
                }
                print(product_dict)
                products_lists.append(product_dict)
                # sleep(1)
            # Close the per-page detail browser before paginating
            driver2.quit()
            try:
                # Jump to the next page
                next_button = WebDriverWait(
                    driver, timeout).until(lambda d: d.find_element_by_xpath(
                        '//div[@class="ui-page"]/div/b/a[@class="ui-page-next"]'))
                next_button.click()
            except BaseException:
                print("No next page, leaving the loop")
                break
            sleep(3)
        fileUtils().saveAsCsv(products_lists,
                              './Data/Product/{0}'.format(topic))
    driver.quit()
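# --- Usage sketch (assumption): TMProductInfos drives the full Tmall flow
# (login, search, paginate, persist); the topic below is a placeholder, and
# each topic is saved to ./Data/Product/<topic> by fileUtils().saveAsCsv().
if __name__ == '__main__':
    TMProductInfos(['placeholder_topic'])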
# Fragment: log in via Weibo (mirroring TMProductInfos above), then print the
# first page of Tmall comments for one product.
# Wait for the Weibo password field to appear
weibo_pwd = WebDriverWait(
    driver, timeout).until(lambda d: d.find_element_by_xpath(
        '//div[@id="pl_login_logged"]/div/div[3]/div/input'))
weibo_pwd.send_keys('zj123!')
# Wait for the login button to appear
submit = WebDriverWait(
    driver, timeout).until(lambda d: d.find_element_by_xpath(
        '//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
submit.click()
sleep(10)
url = 'https://detail.tmall.com/item.htm?id=596816395443&skuId=4315774262712&areaId=320200&user_id=2817130358&cat_id=2&is_b=1&rn=46d9826dde3d0497b455413d55cd753e&on_comment=1'
driver.get(url)
# Wait for the comment data to render before parsing the page
sleep(10)
selector = etree.HTML(driver.page_source)
rate_list = selector.xpath('//div[@id="J_Reviews"]/div/div[6]/table/tbody/tr')
for rate in rate_list:
    print(delSpecialChars(rate.xpath('./td[1]/div[1]/div[1]/text()')[0]))
    print("====================")
# Click through to the next page
# next_button = WebDriverWait(driver, timeout).until(
#     lambda d: d.find_element_by_xpath('/html/body/div[5]/div/div[4]/div/div[1]/div/div[10]/div[1]/div/div[7]/div/a[3]'))
# next_button.click()
print(
    selector.xpath(
        '/html/body/div[5]/div/div[4]/div/div[1]/div/div[10]/div[1]/div/div[7]/div/a[3]/text()'
    ))
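# --- Sketch (assumption): the fixed sleep(10) before parsing is slow and
# brittle. An explicit wait on the review table, using the same old-style
# Selenium API as the rest of this file; the timeout default is an assumption.
from lxml import etree
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_reviews(driver, timeout=15):
    # Block until at least one review row exists, then parse the page once.
    WebDriverWait(driver, timeout).until(lambda d: d.find_element_by_xpath(
        '//div[@id="J_Reviews"]//table/tbody/tr'))
    return etree.HTML(driver.page_source)

# selector = wait_for_reviews(driver)  # would replace the sleep(10) + etree.HTML pair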