Example #1
import requests
from bs4 import BeautifulSoup

import UserAgent  # local helper module that supplies random request headers


def screen(url, select):
    headers = UserAgent.get_headers()  # pick a random set of request headers
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'  # the site is served in GBK encoding
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)  # elements matching the CSS selector
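A minimal usage sketch (not from the original source): the URL and CSS selector below are placeholder assumptions, and UserAgent.get_headers() is assumed to be importable.

if __name__ == '__main__':
    # Hypothetical call: list every link inside a GBK-encoded page's list items.
    for link in screen('https://example.com/list.html', 'ul li a'):
        print(link.get_text(strip=True), link.get('href'))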
Example #2
import re

import requests
from lxml import etree as le

import UserAgent  # local helper module that supplies random request headers


def spider_home():
    """
    Fetch the first matching product's URL, title and price from a JD search.
    :return: the product id parsed out of the product URL
    """
    shop = input("Enter the product to search for: ")

    global headers
    headers = UserAgent.get_headers()  # pick a random set of request headers

    url = 'https://search.jd.com/Search?keyword={shop}&enc=utf-8&wq=%E5%B0%8F&pvid=469d5d51a3184cc9a053124dc020b31f'.format(
        shop=shop)

    try:
        r = requests.get(url=url, headers=headers).content
        content = le.HTML(r)

        # Link, price and title nodes of the first item in the result list.
        href = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')
        price = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/strong/i/text()')
        title_01 = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/a/em/text()')

        # Strip stray whitespace and newlines from the title fragments.
        title = [x.strip() for x in title_01 if x.strip() != '']

        # The product id is the run of digits embedded in the product URL.
        number = re.compile(r'\d+').findall(str(href))

        print("Product price: " + "".join(price))

        global shop_title  # global so the output file can be renamed after the title
        shop_title = "".join(title)
        print("Product title: " + shop_title)

        # The @href values are protocol-relative, so prepend the scheme.
        global href_shop
        for index in href:
            href_shop = 'http:' + index
            print(href_shop)

        # Return the first id found (the loop exits on its first iteration).
        for num in number:
            return num

    except Exception:
        print('Scrape failed')
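A hedged usage sketch: assuming the UserAgent helper is importable and the JD search page is reachable, the returned id is the product code that Example #4 below plugs into the comment-feed URL.

if __name__ == '__main__':
    product_id = spider_home()  # prompts for a keyword, prints title/price/link
    if product_id:
        print('Product id: ' + product_id)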
Example #3
import requests
from bs4 import BeautifulSoup

import UserAgent  # local helper module that supplies random request headers


def screenPage(url, select):
    response = requests.get(url=url, headers=UserAgent.get_headers())
    response.encoding = 'gbk'  # the site is served in GBK encoding
    soup = BeautifulSoup(response.text, 'lxml')
    # Text of the node immediately following the first selector match.
    return soup.select(select)[0].next_sibling.text
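A hypothetical call, mirroring screen() above; the URL and selector are placeholders, and the selector is assumed to match a node whose next sibling holds the wanted text.

if __name__ == '__main__':
    chapter_text = screenPage('https://example.com/book/1.html', 'div.title')
    print(chapter_text)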
Example #4
import scrapy
import json
import os
import time
import random
import jd_home1
import jd_main
import UserAgent

number = jd_home1.spider_home()  # fetch the product id
comment_file_path = '../../../id.txt'  # file storage location
headers = UserAgent.get_headers()  # pick a random set of request headers


class JdHomeSpider(scrapy.Spider):
    name = 'jd_home'

    # allowed_domains = [' ']
    # start_urls = ['http:// /']

    def start_requests(self):
        number_page = 10
        try:
            for page in range(1, number_page):
                url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={number}' \
                      '&score=0&sortType=5&page={page}&pageSize=10&isShadowSku=0&fold=1'.format(
                    page=page,
                    number=number
                )
                yield scrapy.Request(
                    url=url,