def screen(url, select):
    """Fetch *url* and return the elements matching the CSS selector.

    :param url: page URL to download
    :param select: CSS selector string passed to ``BeautifulSoup.select``
    :return: list of matching Tag objects (empty list when nothing matches)
    :raises requests.RequestException: on network failure or timeout
    """
    headers = UserAgent.get_headers()  # pick a random User-Agent header
    # timeout prevents the request from hanging indefinitely on a dead host
    response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = 'gbk'  # the target site serves GBK-encoded pages
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)
def spider_home():
    """Search JD.com for a product and print the first hit's price/title/URL.

    Prompts the user for a keyword, fetches the JD search page, extracts the
    first listing's link, price and title via XPath, prints them, and returns
    the product id parsed from the link.

    Side effects: assigns the module-level globals ``headers`` and
    ``href_shop``.  ``shop_title`` is declared global but never assigned here
    — NOTE(review): looks unintentional (only ``shop_title_01`` is set);
    confirm against the code that reads it.

    :return: the first product id (str) found in the result link, or None
             when the request/parse fails or no id is present.
    """
    shop = input("请输入你要搜索的商品:")
    global headers
    headers = UserAgent.get_headers()  # pick a random User-Agent header
    url = 'https://search.jd.com/Search?keyword={shop}&enc=utf-8&wq=%E5%B0%8F&pvid=469d5d51a3184cc9a053124dc020b31f'.format(
        shop=shop)
    try:
        # timeout keeps the interactive prompt from hanging on a dead host
        r = requests.get(url=url, headers=headers, timeout=10).content
        content = le.HTML(r)
        href = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')
        price = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/strong/i/text()')
        title_01 = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/a/em/text()')
        # strip surrounding whitespace/newlines from the title fragments
        title = [x.strip() for x in title_01 if x.strip() != '']
        number = re.findall(r'\d+', str(href))
        shop_price_01 = "".join(price)
        print("商品价格:" + shop_price_01)
        global shop_title  # module-level title, used elsewhere for renaming files
        shop_title_01 = "".join(title)
        print("商品标题:" + shop_title_01)
        for index in href:
            global href_shop
            href_shop = 'http:' + index  # search results give protocol-relative links
            print(href_shop)
        # return the first extracted id; falls through to implicit None if empty
        for num in number:
            return num
    except Exception:
        # was a bare ``except:``; keep the best-effort behaviour but no longer
        # swallow SystemExit / KeyboardInterrupt
        print('爬取失败')
def screenPage(url, select):
    """Download *url* (GBK-encoded site) and return the text of the node
    immediately following the first element matched by the CSS selector.

    :param url: page URL to download
    :param select: CSS selector string passed to ``BeautifulSoup.select``
    :return: text of the first match's next sibling
    """
    response = requests.get(url=url, headers=UserAgent.get_headers())
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    first_match = soup.select(select)[0]
    return first_match.next_sibling.text
import scrapy import json import os import time import random import jd_home1 import jd_main import UserAgent number = jd_home1.spider_home() # 获取产品id comment_file_path = '../../../id.txt' # 文件存储位置 headers = UserAgent.get_headers() # 随机获取headers表头 class JdHomeSpider(scrapy.Spider): name = 'jd_home' # allowed_domains = [' '] # start_urls = ['http:// /'] def start_requests(self): number_page = 10 try: for page in range(1, number_page): url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={number}' \ '&score=0&sortType=5&page={page}&pageSize=10&isShadowSku=0&fold=1'.format( page=page, number=number ) yield scrapy.Request( url=url,