Example #1
def JinDongCommentCases(url_list):
    prod_url = url_list[2]
    prod_topic = url_list[1]
    prod_id = re.findall(r'(\d+)', prod_url)[0]
    print("Fetching comment info for product " + prod_topic + ':' + str(prod_id))

    # Collected comment records
    rates_list = []

    # Comment API URL templates ({0} = product id, {1} = zero-based page number)
    rate_urls = [
        # Default comment endpoint
        'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv44&productId={0}&score=0&sortType=6&page={1}&pageSize=10&isShadowSku=0&fold=1',
        # Folded (low-value) comment endpoint
        'https://club.jd.com/comment/getProductPageFoldComments.action?callback=jQuery1719501&productId={0}&score=4&sortType=5&page={1}&pageSize=5&_=1573096406813'
    ]

    # 1. Build the request
    # Request headers
    headers = {
        'accept':
        '*/*',
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'zh-CN,zh;q=0.9',
        "cookie":
        '__jdu=1150543271; shshshfpa=0cb162de-cb82-21b8-49a7-7e1fd26a3efd-1570864191; user-key=d5809892-c823-402e-9748-c84b2469d56f; cn=0; shshshfpb=eTsoprn6f4hkN00S8LggPuQ%3D%3D; unpl=V2_ZzNtbRYAS0Z8WkQAehlVB2JQRl0SUUcVd1oTAC8YVFIyV0BYclRCFX0URlVnG10UZwYZWEtcRx1FCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWARjBhBeRFdzJXI4dmR%2bG1gDbwIiXHJWc1chVEVSexlcDSoDEllDU0YXdg5GZHopXw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ef9b8c3e01834be1a7513cdee09fdec4|1572418139698; shshshfp=4ecb84897eabb0f7a4c6348b7cdc7d0a; __jda=122270672.1150543271.1570864187.1572825530.1573090824.9; __jdc=122270672; areaId=12; ipLoc-djd=12-984-3384-0; wlfstk_smdl=gcda47s1yytkclehvxho46m7ddz5g7ow; TrackID=1KNUUCIn3e7IMNektPzhbcu7wSO0kDr7PEe_KWvFCOXkJh4Zo6p9lf8KOj5iwp4Yidll4C9iAu7fQF6LVOjeB1LGNsaTdxOTqpshIt79InXGwUBG-R8JW8h4lpF-aMXFlBoc7nuE4YFFi_IXSENLUoA; thor=F5548B286F0AC84835F479E2098B937588592D856D78425D7FC38CD7238081AFCBA255023DFA3D8E13AF80EB0481FBDF4DA6C1A35102B43FEA63A3914094409E2250E5F462224217F1004694F9EC7CF2DA417BF181A528377DE99BED15AD4C25157B03BD7C98D6058B3B22E3F300B51E9F9A64987B3D551B14DCFF630D20CCBF954CBC1087415F2C2203531C10B881874F74CD45F930D0F4802E5F203320EEDE; pinId=eqbOg6AqvNqT4t6ZRIp7VrV9-x-f3wj7; pin=jd_5580681fb886d; unick=jd_181685ayj; ceshi3.com=103; _tp=OQVsjG6Pu5TIXKleFObW0uc7fxOqC8rImaa7i%2FLjfqM%3D; _pst=jd_5580681fb886d; shshshsID=d4ef035cd6502b3e3bbb5e5859bb09c1_2_1573090894262; __jdb=122270672.4.1150543271|9.1573090824; 3AB9D23F7A4B3C9B=4WQN5JCPKTD4EYGF7GGHYDUIBN64EH5SZHPCNA56CB2G7HP52UGN73YBUMQ2EOMZI4WXVSWB3CSTQT2KOLQIVGGV5A; JSESSIONID=99B9C173D8D05BABCE00F2429A497E26.s1',
        "referer":
        "{0}".format(prod_url),
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400"
    }
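    # Both endpoints return JSONP, so the regex below strips the callback
    # wrapper before json.loads parses the body.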
    response = requests.get(rate_urls[1].format(prod_id, 0), headers=headers)
    rates_jsons = json.loads(re.findall('{.*}', response.text)[0])
    sleep(3)
    # Total number of comments (the page count is derived from it below)
    comment_count = rates_jsons['productCommentSummary']['commentCount']
    print("===============================")
    for rate_url in rate_urls:
        for page in range(0, int(comment_count / 10 + 1)):
            print("Total " + str(comment_count) + " comments; fetching page " + str(page))
            sleep(3)
            try:
                rates_responses = requests.get(rate_url.format(prod_id, page),
                                               headers=headers)
                rates = json.loads(re.findall('{.*}', rates_responses.text)[0])
            except Exception:
                print("No data")
                break

            rates_lists = rates['comments']
            for rate_list in rates_lists:
                rate = delSpecialChars(rate_list['content'])
                prod_color = rate_list['productColor']
                prod_name = rate_list['referenceName']
                rate_score = rate_list['score']
                rate_dict = {
                    'add_time': now_time,
                    'prod_name': prod_name,
                    'rate_score': rate_score,
                    'rate': rate,
                    'prod_color': prod_color,
                    'prod_url': prod_url,
                    'prod_topic': prod_topic,
                    'prod_id': prod_id,
                    'sale_num': str(comment_count)
                }
                print(rate_dict)
                rates_list.append(rate_dict)
            if not rates_lists:
                break

    # Save the collected data to a CSV file
    fileUtils().saveAsCsv(rates_list, './Data/Rates/{0}'.format(str(prod_id)))
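
A minimal invocation sketch for JinDongCommentCases. The url_list layout (topic at index 1, product URL at index 2) is inferred from the function body; the product URL below is hypothetical, and the module-level names used above (re, requests, json, sleep, now_time, delSpecialChars, fileUtils) must already be in scope:

url_list = ['', '电动车', 'https://item.jd.com/100012345678.html']  # hypothetical entry
JinDongCommentCases(url_list)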
Example #2
driver.get(steelUrl)
time.sleep(1)
button = driver.find_element_by_link_text('{}日上海市场Cr系合结钢价格行情'.format(now_day))
real_url = button.get_attribute('href')

print("爬取材料:"+origin_name+"网址为"+real_url)
sleep(1)
driver.get(real_url)
sleep(1)
xpath_date = etree.HTML(driver.page_source)
# Rows of the market quotation table
tr_list = xpath_date.xpath('//table[@id="marketTable"]/tbody/tr')
for tr in tr_list:
    try:
        # e.g. origin_name = 'Cr系合结钢(40CrΦ20-28)'
        name = delSpecialChars(tr.xpath('./td[1]/a/text()')[0])
        paihao = delSpecialChars(tr.xpath('./td[2]/text()')[0])
        guige = delSpecialChars(tr.xpath('./td[4]/text()')[0])
        area = delSpecialChars(tr.xpath('./td[5]/text()')[0])
        real_name = name+'('+guige+')'
        if origin_name == real_name and area == '杭钢' and paihao == '40Cr':
            price = delSpecialChars(tr.xpath('./td[6]/text()')[0])
            steelDict = {'name': real_name, 'area': area, 'date': now_time, 'price': price, 'add_time': now_time}
            print(steelDict)
            table.append(steelDict)
    except Exception:
        print("error: row could not be parsed")
        continue
print('Fetched the {} Shanghai market Cr-series alloy structural steel quotes'.format(now_day))
print('===================================================')
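
Every example on this page calls a project helper named delSpecialChars that is not shown here. A minimal stand-in, assuming it only strips newlines, tabs and surrounding whitespace from scraped text (the real implementation may do more):

def delSpecialChars(text):
    # Hypothetical stand-in for the project's helper: trim whitespace and newlines
    return text.replace('\n', '').replace('\t', '').strip()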
def JDCarsInfo(car_name):
    # Product info collection
    product_list = []
    # Selenium options
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    option.add_argument('incognito')
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Search URL
    url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8'.format(car_name)
    # Load the page
    driver.get(url)
    sleep(2)
    html_doc = driver.page_source
    # Parse with BeautifulSoup (lxml) and pretty-print the HTML
    soup = BeautifulSoup(html_doc, 'lxml').prettify()
    # Get the page count (and check whether pagination exists at all)
    pages = etree.HTML(soup).xpath(
        '//*[@id="J_bottomPage"]/span[2]/em[1]/b/text()')
    if len(pages) == 0:
        pages = '1'
    else:
        pages = re.findall(r'\d+', pages[0])[0]
    print('*****Start crawling ' + car_name + ': ' + pages + ' pages in total*****')
    sleep(1)
    # Loop over every result page
    for page in range(1, int(pages) + 1):

        # for page in range(5):
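        # JD splits each visible result page into two half-pages, so the page
        # URL parameter takes the odd values 1, 3, 5, ...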
        url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8&page={1}'.format(
            car_name, str(page * 2 - 1))
        print('Sleeping 2s... crawling page ' + str(page) + ' ======== url: ' + url)
        driver.get(url)
        #sleep(2)
        # Parse with lxml's XPath
        xpath_date = etree.HTML(
            BeautifulSoup(driver.page_source, 'lxml').prettify())
        products = xpath_date.xpath('//*[@id="J_goodsList"]/ul/li')
        for product in products:
            # Product fields
            try:
                prod_store = delSpecialChars(
                    product.xpath(
                        './/a[@class="curr-shop hd-shopname"]/text()')[0])
                prod_price = delSpecialChars(
                    product.xpath('./div/div[3]/strong/i/text()')[0])
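                # Strip the decimal part (correct for prices like '1299.00')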
                prod_price = prod_price[:-((prod_price.index('.')) - 1)]
                prod_url = 'https:' + delSpecialChars(
                    product.xpath('./div/div[4]/a/@href')[0])
                prod_id = re.findall(r'(\d+)', prod_url)[0]
                store_url = 'https:' + delSpecialChars(
                    product.xpath('./div/div[7]/span/a/@href')[0])
                store_id = re.findall(r'(\d+)', store_url)[0]
                store_rates = '评价' + delSpecialChars(
                    product.xpath('./div/div[5]/strong/a/text()')[0])

                # Open the product URL to get detail info
                driver.get(prod_url)
                prod_response = driver.page_source
                # sleep(1)
                selector = etree.HTML(prod_response)
                prod_name = delSpecialChars(
                    selector.xpath('//div[@class="sku-name"]/text()')[0])

                # Detailed product spec description
                shop_items = selector.xpath(
                    '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li')
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(
                        delSpecialChars(shop_item.xpath('./text()')[0]))
                # Store rating
                store_score = delSpecialChars(
                    selector.xpath(
                        '//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[2]/div/div/div/div/@title'
                    )[0])

                prod_dict = {
                    'prod_store': prod_store,
                    'prod_price': prod_price,
                    'prod_url': prod_url,
                    'store_url': store_url,
                    'store_rates': store_rates,
                    'prod_name': prod_name,
                    'shop_list': str(shop_list),
                    'store_score': store_score,
                    'add_time': now_time,
                    'prod_id': prod_id,
                    'store_id': store_id
                }
                # Keep only items priced at 1000+ (complete vehicles)
                if int(prod_price) < 1000:
                    continue
                product_list.append(prod_dict)
                print("Product info: " + str(prod_dict))
            except Exception:
                print('Product info did not meet the expected conditions')

    # Shut down the driver
    driver.quit()
    return product_list
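
A hedged usage sketch for JDCarsInfo; the search keyword and save path are illustrative, and saving the result with fileUtils mirrors Example #1:

products = JDCarsInfo('雅迪电动车')  # hypothetical keyword
fileUtils().saveAsCsv(products, './Data/Product/JDCars')  # hypothetical path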
def TMCarsInfo(car_name):
    # Product info collection
    product_list = []
    # Selenium options
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    option.add_argument('incognito')
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Search URL
    url = 'https://list.tmall.com/search_product.htm?q={0}'.format(car_name)
    # Load the page
    driver.get(url)
    sleep(2)
    html_doc = driver.page_source
    # Parse with BeautifulSoup (lxml) and pretty-print the HTML
    soup = BeautifulSoup(html_doc, 'lxml').prettify()
    # Get the page count from the "共N页" label on the page
    pages = re.findall(r'共(\d+)页', soup)[0]
    print('*****Start crawling ' + car_name + ': ' + pages + ' pages in total*****')
    # Loop over every result page
    for page in range(1, int(pages) + 1):
        url = 'https://list.tmall.com/search_product.htm?q={0}&totalPage={1}&jumpto={2}'.format(
            car_name, pages, page)
        print('Sleeping 5s... crawling page ' + str(page) + ' ======== url: ' + url)
        driver.get(url)
        sleep(5)
        # Parse with lxml's XPath
        xpath_date = etree.HTML(
            BeautifulSoup(driver.page_source, 'lxml').prettify())
        products = xpath_date.xpath('//*[@id="J_ItemList"]/div')
        # Extract product fields
        for product in products:
            prod_url = 'https:' + product.xpath('./div/div[1]/a/@href')[0]
            prod_id = re.findall(r'\?id=(\d+)', prod_url)[0]
            prod_price = product.xpath('./div/p[1]/em/@title')[0]
            # Strip the decimal part (correct for prices like '1299.00')
            prod_price = prod_price[:-((prod_price.index('.')) - 1)]
            prod_title = product.xpath('./div/p[2]/a/@title')[0]
            prod_store = product.xpath('./div/p[3]/span[3]/@data-nick')[0]
            prod_msale = delSpecialChars(
                product.xpath('./div/p[3]/span[1]/text()')
                [0]) + delSpecialChars(
                    product.xpath('./div/p[3]/span[1]/em/text()')[0])
            prod_rates = delSpecialChars(
                product.xpath('./div/p[3]/span[2]/text()')
                [0]) + delSpecialChars(
                    product.xpath('./div/p[3]/span[2]/a/text()')[0])
            store_url = 'https:' + product.xpath('./div/div[2]/a/@href')[0]
            store_id = re.findall(r'\?user_number_id=(\d+)', store_url)[0]

            # Open the product URL to get detail info
            driver.get(prod_url)
            sleep(1)
            prod_response = BeautifulSoup(driver.page_source, 'lxml').prettify()
            selector = etree.HTML(prod_response)
            shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
            # Detailed product spec description
            shop_list = []
            for shop_item in shop_items:
                shop_list.append(
                    delSpecialChars(shop_item.xpath('./text()')[0]))
            # Store rating scores
            score_list = []
            scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
            for score in scores:
                res_score = delSpecialChars(
                    score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                        score.xpath('./div[2]/span/text()')[0])
                score_list.append(res_score)
            # Favorites/popularity XPath (captured here but not used below)
            popularity = '/html/body/div[5]/div/div[2]/div/div[1]/div[2]/p/span[2]'
            # Save the detail record
            prod_dict = {
                'car_name': car_name,
                'add_time': now_time,
                'prod_url': prod_url,
                'prod_title': prod_title,
                'store_url': store_url,
                'prod_price': prod_price,
                'prod_store': prod_store,
                'prod_msale': prod_msale,
                'prod_rates': prod_rates,
                'prod_id': prod_id,
                'store_id': store_id,
                'shop_list': str(shop_list),
                'score_list': str(score_list)
            }
            try:
                # Keep only items priced at 1000+ (complete vehicles)
                if int(prod_price) >= 1000:
                    product_list.append(prod_dict)
            except ValueError:
                print('Product did not meet the conditions')
            print("Product info: " + str(prod_dict))

    # Shut down the driver
    driver.quit()
    return product_list
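
TMCarsInfo is driven the same way; the keyword and save path are again only illustrative:

products = TMCarsInfo('电动车')  # hypothetical keyword
fileUtils().saveAsCsv(products, './Data/Product/TMCars')  # hypothetical path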
def TMProductInfos(topic_list):
    option = webdriver.ChromeOptions()
    option.add_argument('--proxy-server=127.0.0.1:8080')
    # Hide the automation flag to avoid bot detection
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Skip image loading to speed up page visits
    option.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # option.add_argument('headless')

    # Replace with the chromedriver path that matches your own system
    driver = webdriver.Chrome(
        executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
        chrome_options=option)
    # Log in to Taobao
    login_url = 'https://login.taobao.com/member/login.jhtml'
    driver.get(login_url)
    # Wait for the password-login option to appear
    password_login = driver.find_element_by_xpath(
        '//div[@class="login-links"]/a[@class="forget-pwd J_Quick2Static"]')
    password_login.click()
    # Wait for the Weibo login option to appear
    weibo_login = driver.find_element_by_xpath('//a[@class="weibo-login"]')
    weibo_login.click()
    # Wait for the Weibo account field to appear
    weibo_user = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[2]/div/input'))
    weibo_user.send_keys('18168546559')
    sleep(1)
    # Wait for the Weibo password field to appear
    weibo_pwd = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[3]/div/input'))
    weibo_pwd.send_keys('zj123!')
    # Wait for the login button to appear
    submit = WebDriverWait(
        driver, timeout).until(lambda d: d.find_element_by_xpath(
            '//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
    submit.click()
    sleep(10)

    for topic in topic_list:
        print("=====开始爬取" + topic + "商品信息=====")
        #产品信息集合
        products_lists = []
        # 搜索主页(切换天猫)
        driver.get('https://www.tmall.com/')
        # Search on Tmall
        search_input = WebDriverWait(
            driver, timeout
        ).until(lambda d: d.find_element_by_xpath(
            '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/div/div/input'
        ))
        search_input.send_keys(topic + '电动车')
        submit = WebDriverWait(
            driver, timeout
        ).until(lambda d: d.find_element_by_xpath(
            '/html/body/div[1]/div[2]/div/div/div/div[2]/form/fieldset/div/button'
        ))
        submit.click()
        sleep(2)
        # Parse the loaded results page
        xpath_date = etree.HTML(driver.page_source)
        # Total page count for the current search
        page_sum = xpath_date.xpath(
            '//div[@class="ui-page-wrap"]/b[2]/form/input[3]/@value')[0]
        print("=====当前搜索总共" + page_sum + "页=====")
        # Fetch part of the data from each page in turn
        for index in range(0, int(page_sum)):
            print("=====开始获取第" + str(index + 1) + "页商品数据=====")
            # XPath node set for every product on the page
            xpath_dates = etree.HTML(driver.page_source)
            product_lists = xpath_dates.xpath('//div[@id="J_ItemList"]/div')
            # Secondary headless driver for the product detail pages
            option.add_argument('headless')
            driver2 = webdriver.Chrome(
                executable_path=r'D:\Maven\YadeaSpider\chromedriver.exe',
                chrome_options=option)
            # Extract each product's attributes
            for product_list in product_lists:
                # Product attributes
                product_price = product_list.xpath(
                    './div/p[@class="productPrice"]/em/@title')[0]
                # Strip the decimal part, assuming prices like '1299.00'
                product_price = product_price[:-(
                    (product_price.index('.')) - 1)].replace('.', '')
                if product_price == '':
                    print("Empty price; not a complete vehicle, skipping")
                    continue
                if int(product_price) < 1000:
                    print("Price under 1000; not a complete vehicle, skipping")
                    continue
                product_title = product_list.xpath(
                    './div/p[@class="productTitle"]/a/@title')[0]
                product_url = 'http:' + product_list.xpath(
                    './div/p[@class="productTitle"]/a/@href')[0]
                product_id = re.findall(
                    r'//detail.tmall.com/item.htm\?id=(\d+)&', product_url)[0]
                shop_url = 'http:' + product_list.xpath(
                    './div/div[@class="productShop"]/a[@class="productShop-name"]/@href'
                )[0]
                shop_id = re.findall(r'user_number_id=(\d+)&', shop_url)[0]
                shop_name = product_list.xpath(
                    './div/p[@class="productStatus"]/span[3]/@data-nick')[0]
                month_sale = product_list.xpath(
                    './div/p[@class="productStatus"]/span/em/text()')[0]
                comment_sum = product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/text()')[0]
                comment_url = 'https:' + product_list.xpath(
                    './div/p[@class="productStatus"]/span[2]/a/@href')[0]

                # Open the product URL to get detail info
                driver2.get(product_url)
                selector = etree.HTML(driver2.page_source)
                shop_items = selector.xpath('//*[@id="J_AttrUL"]/li')
                # Detailed product spec description
                shop_list = []
                for shop_item in shop_items:
                    shop_list.append(
                        delSpecialChars(shop_item.xpath('./text()')[0]))
                # Store rating scores
                score_list = []
                scores = selector.xpath('//*[@id="shop-info"]/div[2]/div')
                for score in scores:
                    res_score = delSpecialChars(
                        score.xpath('./div[1]/text()')[0]) + delSpecialChars(
                            score.xpath('./div[2]/span/text()')[0])
                    score_list.append(res_score)

                # Save the record dict
                product_dict = {
                    'product_title': product_title,
                    'product_id': product_id,
                    'product_url': product_url,
                    'product_price': product_price,
                    'month_sale': month_sale,
                    "shop_name": shop_name,
                    "shop_url": shop_url,
                    "shop_id": shop_id,
                    "comment_sum": comment_sum,
                    'add_time': add_time,
                    'topic': topic,
                    'shop_list': str(shop_list),
                    'score_list': str(score_list),
                    'comment_url': comment_url
                }
                print(product_dict)
                products_lists.append(product_dict)
                # sleep(1)
            # Close the per-page detail driver before moving on
            driver2.quit()
            try:
                # Go to the next page
                next_button = WebDriverWait(
                    driver, timeout
                ).until(lambda d: d.find_element_by_xpath(
                    '//div[@class="ui-page"]/div/b/a[@class="ui-page-next"]'))
                next_button.click()
            except Exception:
                print("No next page; leaving the loop")
                break
            sleep(3)
        fileUtils().saveAsCsv(products_lists,
                              './Data/Product/{0}'.format(topic))
    driver.quit()
Example #3
weibo_pwd = WebDriverWait(
    driver, timeout).until(lambda d: d.find_element_by_xpath(
        '//div[@id="pl_login_logged"]/div/div[3]/div/input'))
weibo_pwd.send_keys('zj123!')
# Wait for the login button to appear
submit = WebDriverWait(
    driver, timeout).until(lambda d: d.find_element_by_xpath(
        '//div[@id="pl_login_logged"]/div/div[7]/div[1]/a/span'))
submit.click()
sleep(10)

url = 'https://detail.tmall.com/item.htm?id=596816395443&skuId=4315774262712&areaId=320200&user_id=2817130358&cat_id=2&is_b=1&rn=46d9826dde3d0497b455413d55cd753e&on_comment=1'
driver.get(url)
# Wait for the comment data to render before parsing the page
sleep(10)
selector = etree.HTML(driver.page_source)

rate_list = selector.xpath('//div[@id="J_Reviews"]/div/div[6]/table/tbody/tr')
for rate in rate_list:
    print(delSpecialChars(rate.xpath('./td[1]/div[1]/div[1]/text()')[0]))
    print("====================")

# Click the next page
# next_button = WebDriverWait(driver, timeout).until(
#     lambda d: d.find_element_by_xpath('/html/body/div[5]/div/div[4]/div/div[1]/div/div[10]/div[1]/div/div[7]/div/a[3]'))
# next_button.click()

print(
    selector.xpath(
        '/html/body/div[5]/div/div[4]/div/div[1]/div/div[10]/div[1]/div/div[7]/div/a[3]/text()'
    ))
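
A hedged sketch of how the commented-out pager above could walk through several comment pages. The XPaths come from the snippet itself; the fixed page count and the pause lengths are assumptions:

for _ in range(5):  # hypothetical number of comment pages
    selector = etree.HTML(driver.page_source)
    for rate in selector.xpath('//div[@id="J_Reviews"]/div/div[6]/table/tbody/tr'):
        print(delSpecialChars(rate.xpath('./td[1]/div[1]/div[1]/text()')[0]))
    next_button = WebDriverWait(driver, timeout).until(
        lambda d: d.find_element_by_xpath(
            '/html/body/div[5]/div/div[4]/div/div[1]/div/div[10]/div[1]/div/div[7]/div/a[3]'))
    next_button.click()
    sleep(3)  # assumed pause while the next page renders
driver.quit()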