def parse(self, response):
    data = response.body.decode()  # get the response content and decode it
    # items = []  # list holding the music info
    # extract all song titles
    titles = re.findall(r'target="play" title="(.*?)" sid=', data)
    html = etree.HTML(data)
    # extract all artists
    artists = html.xpath('//span[@class="artistName"]/a')
    for i in range(0, len(titles)):
        item = MyspiderItem()  # the item object behaves like a dict
        item["title"] = titles[i]
        item["artist"] = artists[i].text
        # yield each dict-like item from the generator; faster than returning all dicts in a list
        yield item
        # items.append(item)
    # return items
    # 1. take the current request url and extract the page index
    beforeurl = response.url
    pat = r"pageIndex=(\d)"
    page = re.search(pat, beforeurl).group(1)
    page = int(page) + 1
    # 2. build the next page url
    if page < 5:
        nexturl = "http://www.htqyy.com/top/musicList/hot?pageIndex=" + str(page) + "&pageSize=20"
        # yield a request for the next page with parse() as the callback;
        # the callback's response argument is the response of the GET request to nexturl
        yield scrapy.Request(nexturl, callback=self.parse)
def detail(self, response):
    """
    Crawl the detail page content.
    :param response:
    :return:
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    item = MyspiderItem()
    selector = etree.HTML(response.text)
    result = soup.find('script', {'type': 'application/ld+json'})
    if result is not None:
        script = json.loads(result.get_text(), strict=False)
        content = selector.xpath(
            '//*[@id="content"]/div/div[1]/div[1]/div[2]/h3/span[1]/a/@href')
        images = soup.find_all('img', src=re.compile(r"/view/group_topic/l/public.*"))
        image_urls = []
        for image in images:
            image_url = image['src']
            image_urls.append(image_url)
        d1 = datetime.now()
        created_ = script["dateCreated"].replace("T", " ")
        d2 = datetime.strptime(created_, "%Y-%m-%d %H:%M:%S")
        item['creator'] = str(content[0])[30:-1]  # slice out the creator's douban id
        item['title'] = script["name"]
        item['createDate'] = script["dateCreated"]
        item['text'] = script["text"]
        item['crawDate'] = datetime.now()
        item['url'] = script["url"]
        item['image_urls'] = image_urls
        # only keep posts published within the last 30 days
        if (d1 - d2).days < 30:
            yield item
def parse(self, response): """ # 获取网站标题 context = response.xpath('/html/head/title/text()') # 提取网站标题 title = context.extract_first() print(title) """ # items = [] for each in response.xpath("//div[@class='li_txt']"): # 将我们得到的数据封装到一个 `ItcastItem` 对象 item = MyspiderItem() # extract()方法返回的都是unicode字符串 name = each.xpath("h3/text()").extract() title = each.xpath("h4/text()").extract() info = each.xpath("p/text()").extract() # xpath返回的是包含一个元素的列表 item['name'] = name[0] item['title'] = title[0] item['info'] = info[0] # print(item) # items.append(item) yield item
def parse(self, response):
    for i in response.xpath('//div[@class="tslb_b"]//tr[1]/following-sibling::*'):
        # create a fresh item for every table row instead of reusing one instance
        item = MyspiderItem()
        item['brand'] = i.xpath('.//td[2]/text()').extract()
        item['line'] = i.xpath('.//td[3]/text()').extract()
        item['car'] = i.xpath('.//td[4]/text()').extract()
        item['details'] = i.xpath('.//td[5]//text()').extract()
        item['problems'] = i.xpath('.//td[6]/text()').extract()
        item['date'] = i.xpath('.//td[7]/text()').extract()
        # items = {
        #     'brand': item['brand'][0],
        #     'line': item['line'][0],
        #     'car': item['car'][0],
        #     'details': item['details'][0],
        #     'problems': item['problems'][0],
        #     'date': item['date'][0]
        # }
        yield item
    for i in range(2, 3):
        url = self.start_url + str(i) + '.shtml'
        yield scrapy.Request(url, callback=self.parse)
def search_result(self, response):
    href = response.css(".guiji_discription a::attr(href)").extract()
    for link in href:
        pattern = re.compile(r'^(\/track\/t-)(.*)(\.htm)$')  # /track/t-XXXXXXXXXXXXXX.htm
        matches = pattern.match(link)
        if matches:
            trackId = matches.group()[9:-4]
            url = "http://www.2bulu.com/space/download_track.htm?trackId={}&type=3".format(trackId)
            item = MyspiderItem()
            item["trackId"] = trackId
            item["itemType"] = "trackId"
            yield item
            yield scrapy.Request(url=url, callback=self.track_download_gpx, dont_filter=True)
    # pagination
    next_page = response.xpath("//div[@class='pages']//a/text()")[-1]
    if next_page.extract() == "下一页":
        self.pageNumber += 1
        # respect the page limit
        if self.endPage != -1 and self.pageNumber > self.endPage:
            return
        # re-submit the search form for the next page
        yield from self.search_function(response)
    else:
        return
def parse(self, response): car_list = response.xpath('//*[@id="tab5"]/table/tbody/tr') for each in car_list: item = MyspiderItem() data = each.xpath('./td').extract() # print(data) for i in range(11): if i == 0: item['batch'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 1: item['sn'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 2: item['company_name'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 3: item['kind'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 4: item['vehicle_model'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 5: item['common_name'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 6: item['nedc'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 7: item['mass'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 8: item['battery_mass'] = data[i][4:-5].replace(u'\xa0', u'') elif i == 9: item['battery_energy'] = data[i][4:-5].replace( u'\xa0', u'') elif i == 10: item['comment'] = data[i][4:-5].replace(u'\xa0', u'') yield item
def parse(self, response):
    data = response.body.decode()  # get the response content
    # items = []  # list holding the music info
    # extract all song titles
    pat = re.compile('target="play" title="(.*?)"', re.I)
    titles = pat.findall(data)
    # extract all artists
    pat = re.compile('target="_blank">(.*?)</a>', re.I)
    artists = pat.findall(data)
    for i in range(0, len(titles)):
        item = MyspiderItem()
        item['title'] = titles[i]
        item['artist'] = artists[i]
        yield item
    # take the current request url and extract the page index
    beforeurl = response.url
    pat1 = r'pageIndex=(\d)'
    page = re.search(pat1, beforeurl).group(1)
    page = int(page) + 1  # pageIndex of the next request
    if page < 5:
        # build the next page url
        nexturl = 'http://www.htqyy.com/top/musicList/hot?pageIndex=' + str(page) + '&pageSize=20'
        # send the request for the next page
        yield scrapy.Request(nexturl, callback=self.parse)
def parse(self, response): node_list = response.xpath('//*[@id="content_left"]/div') # with open('baidu_xpath.text', 'w', encoding='utf-8')as f: # f.writelines(node_list.extract()) for quote in node_list: item = MyspiderItem() # 筛选掉广告 if len(quote.xpath('./h3')) > 0: name_baidu_re = quote.xpath('./h3/a').extract_first() # print("q"*30, "\n", name_baidu_re) try: # 用正则表达式清洗数据,解决em的问题 ZZ = '_blank\\"\>.*' name_baidu = re.search(ZZ, name_baidu_re).group().replace( "_blank\">", "").replace("<em>", "").replace("</em>", "").replace("</a>", "") # print("*" * 30, "\n", name_baidu) link_baidu = quote.xpath('./h3/a/@href').extract_first() item['name_baidu'] = name_baidu item['link_baidu'] = link_baidu yield item except: pass
def parse(self, response):
    # collect all the books
    books = response.xpath('//li[@class="result_name"]/..')
    url_list = response.xpath('//li[@class="result_name"]/a/@href').extract()
    # wrap the data in an item object
    item = MyspiderItem()
    for book, url in zip(books, url_list):
        # parse the fields out of each book node
        item['name'] = book.xpath('./li[@class="result_name"]/a[1]/text()').extract()[0]
        # item['info'] = book.xpath('./li[@class="result_name"]/../li[2]/text()').extract()[0]
        item['price'] = '¥' + book.xpath(
            './li[@class="result_book"]/ul/li[@class="book_dis"]/b/text()').re(r'¥(\d*\.\d*)')[0]
        # print(item)
        item_1 = str(item)
        yield scrapy.Request(url=url, meta={'item': item_1},
                             callback=self.parse_content, dont_filter=False)
        # print(type(item_1))
        # yield item
    # find the next-page element and build its URL
    next_page = response.css('#right .pro_pag a::attr("href")').extract()[-1]
    url = response.urljoin(next_page)
    yield scrapy.Request(url=url, callback=self.parse, dont_filter=False)
def parse(self, response): div_list = response.xpath('//div[@class="left"]/div[@class="sons"]') for div in div_list: titles = div.xpath('.//b/text()').extract_first() author = div.xpath('.//p[@class="source"]//text()').extract() contents = div.xpath('.//div[@class="contson"]//text()').extract() contents = ''.join(contents).strip() # poem = {} item = MyspiderItem() if titles != None: item["标题"] = titles item["作者"] = author item["内容"] = contents yield item # print(poem) href = response.xpath('//div[@class="pagesright"]/a[@id="amore"]/@href').extract_first() try: if len(href) != 0: href = response.urljoin(href) yield scrapy.Request( href, callback=self.parse, ) except: pass
def parse(self, response):
    # handle the response for start_url
    response_xpath = response.xpath("//title/text()").extract()
    print(response_xpath)
    item = MyspiderItem()
    item["name"] = {"name": "张三", "age": 19}
    yield item
def detail_parse(self, response):
    jtext = response.text
    data = json.loads(jtext)
    ep_list = data['data']['ep_list']
    item = MyspiderItem()
    item['comic_id'] = data['data']['id']
    item['name'] = data['data']['title']
    item['author'] = data['data']['author_name']
    item['cover'] = data['data']['vertical_cover']
    item['intr'] = data['data']['evaluate']
    item['last_short_title'] = data['data']['last_short_title']
    for ep in ep_list:
        form_data = {"ep_id": ep['id']}
        item = copy.deepcopy(item)
        item['chapter_title'] = ep['title']
        item['chapter_id'] = ep['id']
        item['chapter_short_title'] = ep['short_title']
        item['chapter_time'] = ep['pub_time']
        target_url = 'https://manga.bilibili.com/twirp/comic.v1.Comic/GetImageIndex?device=pc&platform=web'
        # self.log(item)
        # self.log('qqqqqqqqqq')
        yield scrapy.Request(target_url,
                             body=json.dumps(form_data),
                             method='POST',
                             callback=self.comic_info,
                             meta={'item': item},
                             headers=self.headers)
def parse(self, response):
    # print(response.text)
    movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
    for i_item in movie_list:
        douban_item = MyspiderItem()
        # get the movie's serial number
        douban_item['serial_number'] = i_item.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        # get the movie name
        douban_item['movie_name'] = i_item.xpath(
            ".//div[@class='hd']/a/span/text()").extract_first()
        # get the movie introduction
        content = i_item.xpath(".//div[@class='bd']//p[1]/text()").extract()
        for i_content in content:
            content_s = "".join(i_content.split())
            douban_item['introduction'] = content_s
        # get the movie summary
        douban_item['evaluate'] = i_item.xpath(
            ".//div[@class='star']//span[4]/text()").extract_first()
        # get the movie's rating
        douban_item['stars'] = i_item.xpath(
            ".//div[@class='star']/span[2]/text()").extract_first()
        # get the movie's review quote
        douban_item['describe'] = i_item.xpath(
            ".//p[@class='quote']/span/text()").extract_first()
        yield douban_item
    next_link = response.xpath("//span[@class='next']/link/@href").extract()
    if next_link:
        next_link = next_link[0]
        yield scrapy.Request('http://movie.douban.com/top250' + next_link,
                             callback=self.parse)
def parse_info(self, response):
    # &since_id={since_id}
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards'):
        since_id = result.get('data').get('cardlistInfo').get('since_id')
        auth = response.meta['auth']
        uid = response.meta['uid']
        weibos = result.get('data').get('cards')
        for weibo in weibos:
            mblog = weibo.get('mblog')
            if mblog and mblog.get('pics'):
                pics = mblog.get('pics')
                for pic in pics:
                    picurl = pic.get('large').get('url')
                    print(since_id, auth, picurl)
                    path = 'd://pic2//' + auth + '//' + re.sub(
                        'https://wx\\d.sinaimg.cn/large/', '', picurl)
                    if not os.path.exists(path):
                        item = MyspiderItem()
                        item['auth'] = auth
                        item['images_urls'] = [picurl]
                        yield item
        url = self.weibo_url.format(uid=uid) + '&since_id=' + str(since_id)
        request = scrapy.Request(url, callback=self.parse_info)
        request.meta['auth'] = auth
        request.meta['uid'] = uid
        yield request
def parse(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    item = MyspiderItem()
    # item['url'] = response.xpath('//td[@class="first-child bz_id_column"]/a/@href').extract()
    # print item['url']
    item['bug_id'] = response.xpath(
        '//td[@class="first-child bz_id_column"]/a/text()').extract()
    return item
def parse(self, response): ## filename="teacher.html" ## open(filename,'w').write(response.body) ## #获取网站标题 ## context=response.xpath('/html/head/title/text()') ## ## #提取网站标题 ## title=context.extract_first() ## print(title) ## pass #存放老师信息的集合 items=[] for each in response.xpath("/html/body/div[1]/div[5]/div[2]/div[1]/ul/li[1]/div[2]"): item=MyspiderItem() #extract()方法返回的都是unicode字符串 name=each.xpath("h3/text()").extract() title=each.xpath("h4/text()").extract() info=each.xpath("p/text()").extract() ## print (name,title,info) #xpath返回的是一个元素的列表 item['name']=name[0] item['title']=title[0] item['info']=info[0] items.append(item) #直接返回最后的数据 return items
def parse(self, response):
    for each in response.xpath(
            '/html/body/div[2]/div/div/div[2]/div/div[2]/div/ul/li'):
        item = MyspiderItem()
        name = each.xpath('./div[1]/p/text()').extract()
        item['name'] = name[0] if len(name) > 0 else ''
        work = each.xpath('./div[2]/dl[1]/dd/text()').extract()
        item['work'] = work[0] if len(work) > 0 else ''
        tellphone = each.xpath('./div[2]/dl[2]/dd/text()').extract()
        item['tellphone'] = tellphone[0] if len(tellphone) > 0 else ''
        fax = each.xpath('./div[2]/dl[3]/dd/text()').extract()
        item['fax'] = fax[0] if len(fax) > 0 else ''
        email = each.xpath('./div[2]/dl[4]/dd/a/text()').extract()
        item['email'] = email[0] if len(email) > 0 else ''
        researchDirection = each.xpath('./div[2]/dl[5]/dd/text()').extract()
        item['researchDirection'] = researchDirection[0] if len(researchDirection) > 0 else ''
        yield item
    if self.offset > 1:
        self.offset -= 1
        yield scrapy.Request(self.url + str(self.offset) + '.htm', callback=self.parse)
def parse(self, response): # filename = "teacher.html" # open(filename, 'wb').write(response.body) items = [] tmp = response.xpath( "//form[@id='ajaxtable']/div[@class='show-list']/ul[@class='for-list']/*" ) # for each in tmp: # 将我们得到的数据封装到一个 `ItcastItem` 对象 item = MyspiderItem() # extract()方法返回的都是unicode字符串 url = each.xpath( "div[@class='titlelink box']/a[@class='truetit']/@href" ).extract_first() name = each.xpath( "div[@class='titlelink box']/a[@class='truetit']/text()" ).extract() # xpath返回的是包含一个元素的列表 item['url'] = self.domain + url item['name'] = name # yield item items.append(item) # 直接返回最后数据 return items
def parse(self, response): node_list = response.xpath("//div[@id='resultList']/div[@class='el']") for node in node_list: item = MyspiderItem() # print(node.xpath("/p/span/a/text()")) item['positionName'] = node.xpath( "./p/span/a/text()").extract_first(default="").strip() item['positionLink'] = node.xpath( "./p/span/a/@href").extract_first(default="").strip() item['companyName'] = node.xpath( "./span[1]/a/text()").extract_first(default="").strip() item['workLocation'] = node.xpath( "./span[2]/text()").extract_first(default="").strip() item['salary'] = node.xpath("./span[3]/text()").extract_first( default="").strip() item['publishTime'] = node.xpath("./span[4]/text()").extract_first( default="").strip() yield item if self.offset < 20: self.offset += 1 url = self.baseURL % (self.offset) yield scrapy.Request(url, callback=self.parse)
def parse(self, response): teacher_list = response.xpath('//div[@class="li_txt"]') for each in teacher_list: item = MyspiderItem() item['name'] = each.xpath('./h3/text()').extract()[0] item['title'] = each.xpath('./h4/text()').extract()[0] item['info'] = each.xpath('./p/text()').extract()[0] yield item
def parse(self, response):
    teacher_list = response.xpath('/html/body/div[1]/div[5]/div[2]/div[3]')
    for i in teacher_list:
        item = MyspiderItem()
        name = i.xpath("./ul/li[1]/div[2]/h3/text()").extract_first()
        item["name"] = name
        print(name)
        print("*" * 50)
        yield item
def track_download_gpx(self, response):
    code = json.loads(response.text)["code"]
    if code == "2":
        url = json.loads(response.text)["url"]
        item = MyspiderItem()
        item["gpxDownloadUrl"] = url
        item["itemType"] = "gpxDownloadUrl"
        yield item
    else:
        logger.error("[GPX] Unable to get the gpx download link.")
def parse(self, response): for each in response.xpath('//div[@class="li_txt"]'): item = MyspiderItem() item['name'] = each.xpath('./h3/text()').extract()[0] item['level'] = each.xpath('./h4/text()').extract()[0] item['desc'] = each.xpath('./p/text()').extract()[0] yield item
def parse(self, response):
    item = MyspiderItem()
    domain = response.url.split("/")[-2]
    # filename = '%s.html' % domain
    # with open(filename, 'wb') as f:
    #     f.write(response.body)
    try:
        if domain == 'weather1d':
            current_tem = response.xpath("//div[@class='tem']/span/text()").extract()
            humidity = response.xpath("//div[@class='zs h']/em/text()").extract()
            wind_direct = response.xpath("//div[@class='zs w']/span/text()").extract()
            wind_level = response.xpath("//div[@class='zs w']/em/text()").extract()
            item['current_tem'] = float(current_tem[0])
            item['humidity'] = float(humidity[0].strip('%'))
            item['wind_direct'] = wind_direct[0]
            item['wind_level'] = float(wind_level[0].strip('级'))
        elif domain == 'weather':
            seven_day = response.xpath("//div[@id='7d']/ul[@class='t clearfix']")
            seven_day_lt = []
            for i in range(1, 8):
                low_temp_lt = seven_day.xpath(
                    "li[%s]/p[@class='tem']/span/text()" % str(i)).extract()
                high_temp_lt = seven_day.xpath(
                    "li[%s]/p[@class='tem']/i/text()" % str(i)).extract()
                # print(low_temp_lt, high_temp_lt)
                low_temp = low_temp_lt[0] if low_temp_lt else ''
                high_temp = high_temp_lt[0] if high_temp_lt else ''
                seven_day_lt.append(low_temp.strip('℃') + '/' + high_temp.strip('℃'))
            for i in range(len(seven_day_lt)):
                item['tem_7d' + str(i)] = seven_day_lt[i]
        elif domain == 'city':
            pm25 = response.xpath("//div[@class='panel']/b/text()").extract()
            # print('PM2.5:', pm25)
            item['pm25'] = float(pm25[0])
    except:
        print(traceback.format_exc())
    item['city'] = "yanqing"
    yield item
def getdloadurl(self, response):
    item = MyspiderItem()
    storyname = response.xpath(
        "//div[@class='Mp3ErweimaText']/p/span/text()").extract()
    stroyDLK = response.xpath(
        "//div[@class='downloadboxlist']/p/a/@href").extract()
    item['name'] = storyname[0]
    item['linkDX'] = stroyDLK[0]
    item['linkLT'] = stroyDLK[1]
    # print(item)
    return item
def parse(self, response): area_list = response.xpath( "//span[@class='elems-l']/div/a[position()>1]") for area in area_list: item = MyspiderItem() item['area_name'] = area.xpath("./text()").extract_first() item['area_href'] = area.xpath("./@href").extract_first() # print(item) yield scrapy.Request( item['area_href'], callback=self.parse_detail, meta={"item": item}, )
def parse(self, response):
    # grab every teacher node
    node_list = response.xpath("//div[@class='li_txt']")
    # item_list = []
    # iterate over the teacher nodes and store the info in item objects
    for node in node_list:
        item = MyspiderItem()
        item['name'] = node.xpath("./h3/text()").extract_first()
        item['title'] = node.xpath("./h4/text()").extract_first()
        item['info'] = node.xpath('./p/text()').extract_first()
        yield item
def parse(self, response): teacher_list = response.xpath('//div[@class="li_txt"]') teacherItem = [] for each in teacher_list: item = MyspiderItem() name = each.xpath('./h3/text()').extract() title = each.xpath('./h4/text()').extract() info = each.xpath('./p/text()').extract() item['name'] = name[0].encode('gbk') item['title'] = title[0].encode('gbk') item['info'] = info[0].encode('gbk') yield item
def parse(self, response): note_list = response.xpath("//div[@class='li_txt']") # items=[] for node in note_list: item = MyspiderItem() name = node.xpath("./h3/text()").extract() title = node.xpath("./h4/text()").extract() info = node.xpath("./p/text()").extract() item['name'] = name[0] item['title'] = title[0] item['info'] = info[0] yield item
def parse(self, response): # filename="music.html" data = response.body.decode(encoding="utf-8", errors="ignore") #获取响应内容 decode()是解码的意思,从html字节码解码成二进制 # open(filename,"wb").write(data)#写入本地 #<a href='/play/16315.html' target="play">06_8号交响曲</a> pat1 = re.compile(r'<a href=\'.*?\' target="play">(.*?)</a>') pat2 = re.compile(r"<a href='(.*?)' target=\"play\"") title = re.findall(pat1,data) songUrl=re.findall(pat2,data) # items=[] for i in range(0,len(title)): item = MyspiderItem() item["title"] = title[i] item["songUrl"] ="http://www.130v.com/"+ songUrl[i] yield item #每构建一个item生成器就返回给Pipeline 相当多线程啦