Code Example #1
File: htqyy_musicSpider.py  Project: jacknotes/job
 def parse(self, response):
     data = response.body.decode()  # get the response content and decode it
     # items = []  # list that would hold the music info
     titles = re.findall(r'target="play" title="(.*?)" sid=',
                         data)  # get all song titles
     html = etree.HTML(data)
     artists = html.xpath('//span[@class="artistName"]/a')  # get all artists
     for i in range(0, len(titles)):
         item = MyspiderItem()  # the item object works like a dict
         item["title"] = titles[i]
         item["artist"] = artists[i].text
         yield item  # yield each item from a generator; faster than collecting every dict in a list
         # items.append(item)
     # return items
     # 1. Get the URL of the current request and extract the page number
     beforeurl = response.url
     pat = r"pageIndex=(\d)"
     page = re.search(pat, beforeurl).group(1)
     page = int(page) + 1
     # 2. Build the URL of the next page
     if page < 5:
         nexturl = "http://www.htqyy.com/top/musicList/hot?pageIndex=" + str(
             page) + "&pageSize=20"
         # yield makes this method a generator: request the next page with parse() as the callback
         yield scrapy.Request(
             nexturl,
             callback=self.parse)  # the callback's response argument is the response to the GET request for this URL
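
Every snippet on this page instantiates MyspiderItem and fills it with dict-style assignments, but none of them shows the item class itself. As a rough sketch only, an items.py matching Code Example #1 might look like the following; the field names are assumptions read off the assignments above, and each project on this page defines its own fields.

import scrapy

class MyspiderItem(scrapy.Item):
    # assumed fields, mirroring item["title"] / item["artist"] in example #1
    title = scrapy.Field()
    artist = scrapy.Field()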
Code Example #2
    def detail(self, response):
        """
        爬取详细内容
        :param response:
        :return:
        """

        soup = BeautifulSoup(response.text, 'html.parser')
        item = MyspiderItem()
        selector = etree.HTML(response.text)
        result = soup.find('script', {'type': 'application/ld+json'})
        if result is not None:
            script = json.loads(result.get_text(), strict=False)
            content = selector.xpath(
                '//*[@id="content"]/div/div[1]/div[1]/div[2]/h3/span[1]/a/@href'
            )  # href of the post creator's profile page
            images = soup.find_all(
                'img', src=re.compile(r"/view/group_topic/l/public.*"))
            image_urls = []
            for image in images:
                image_url = image['src']
                image_urls.append(image_url)
            d1 = datetime.now()
            created_ = (script["dateCreated"]).replace("T", " ")
            d2 = datetime.strptime(created_, "%Y-%m-%d %H:%M:%S")
            item['creator'] = str(content[0])[30:-1]  # slice out the poster's douban id
            item['title'] = script["name"]
            item['createDate'] = script["dateCreated"]
            item['text'] = script["text"]
            item['crawDate'] = datetime.now()
            item['url'] = script["url"]
            item['image_urls'] = image_urls
            # only keep posts published within the last 30 days
            if (d1 - d2).days < 30:
                yield item
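
Code Example #2 collects the picture URLs into item['image_urls'], which is the field name Scrapy's built-in ImagesPipeline reads by default. Whether this project actually enables that pipeline is not shown; if it does, the settings.py would contain something along these lines (the storage path is a placeholder):

# settings.py (sketch) -- download everything listed in item['image_urls'];
# the ImagesPipeline also requires Pillow to be installed
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/path/to/images'  # placeholder storage directory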
Code Example #3
File: itcast.py  Project: windleafy/my_py_project
    def parse(self, response):
        """
        # 获取网站标题
        context = response.xpath('/html/head/title/text()')
        # 提取网站标题
        title = context.extract_first()
        print(title)
        """

        # items = []
        for each in response.xpath("//div[@class='li_txt']"):
            # wrap the extracted data in an `ItcastItem` object
            item = MyspiderItem()
            # extract() always returns unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath returns a list containing a single element
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # print(item)
            # items.append(item)
            yield item
Code Example #4
    def parse(self, response):

        for i in response.xpath(
                '//div[@class="tslb_b"]//tr[1]/following-sibling::*'):
            # create a fresh item per table row; reusing a single instance
            # across yields would let later rows overwrite earlier ones
            item = MyspiderItem()
            item['brand'] = i.xpath('.//td[2]/text()').extract()
            item['line'] = i.xpath('.//td[3]/text()').extract()
            item['car'] = i.xpath('.//td[4]/text()').extract()
            item['details'] = i.xpath('.//td[5]//text()').extract()
            item['problems'] = i.xpath('.//td[6]/text()').extract()
            item['date'] = i.xpath('.//td[7]/text()').extract()

            # items = {
            #     'brand': item['brand'][0],
            #     'line': item['line'][0],
            #     'car': item['car'][0],
            #     'details': item['details'][0],
            #     'problems': item['problems'][0],
            #     'date': item['date'][0]
            # }

            yield item

        for i in range(2, 3):
            url = self.start_url + str(i) + '.shtml'
            yield scrapy.Request(url, callback=self.parse)
Code Example #5
    def search_result(self, response):
        hrefs = response.css(".guiji_discription a::attr(href)").extract()
        for href in hrefs:
            pattern = re.compile(
                r'^(\/track\/t-)(.*)(\.htm)$')  # /track/t-XXXXXXXXXXXXXX.htm
            matches = pattern.match(href)
            if matches:
                trackId = matches.group()[9:-4]
                url = "http://www.2bulu.com/space/download_track.htm?trackId={}&type=3".format(
                    trackId)

                item = MyspiderItem()
                item["trackId"] = trackId
                item["itemType"] = "trackId"
                yield item

                yield scrapy.Request(url=url,
                                     callback=self.track_download_gpx,
                                     dont_filter=True)
        # pagination: follow the "下一页" (next page) link
        next_page = response.xpath("//div[@class='pages']//a/text()")[-1]
        if next_page.extract() == "下一页":
            self.pageNumber += 1

            # cap the number of pages
            if self.endPage != -1 and self.pageNumber > self.endPage: return

            # re-submit the search form
            yield from self.search_function(response)
        else:
            return
Code Example #6
File: itcast.py  Project: kangkang006/myspider
 def parse(self, response):
     car_list = response.xpath('//*[@id="tab5"]/table/tbody/tr')
     for each in car_list:
         item = MyspiderItem()
         data = each.xpath('./td').extract()
         # print(data)
         # map each <td> cell onto its item field, in column order
         field_names = ['batch', 'sn', 'company_name', 'kind',
                        'vehicle_model', 'common_name', 'nedc', 'mass',
                        'battery_mass', 'battery_energy', 'comment']
         for i, field in enumerate(field_names):
             # strip the surrounding <td>...</td> markup and non-breaking spaces
             item[field] = data[i][4:-5].replace(u'\xa0', u'')
         yield item
Code Example #7
    def parse(self, response):
        data = response.body.decode()  # get the response content
        # items = []  # list that would hold the music info

        # get all song titles
        pat = re.compile('target="play" title="(.*?)"', re.I)
        titles = pat.findall(data)
        # get all artists
        pat = re.compile('target="_blank">(.*?)</a>', re.I)
        artists = pat.findall(data)

        for i in range(0, len(titles)):
            item = MyspiderItem()
            item['title'] = titles[i]
            item['artist'] = artists[i]
            yield item

        # get the URL of the current request and extract the page number
        beforeurl = response.url
        pat1 = r'pageIndex=(\d)'
        page = re.search(pat1, beforeurl).group(1)
        page = int(page) + 1  # pageIndex of the next request
        if page < 5:
            # build the next page URL
            nexturl = 'http://www.htqyy.com/top/musicList/hot?pageIndex=' + str(
                page) + '&pageSize=20'
            # send the next request
            yield scrapy.Request(nexturl, callback=self.parse)
Code Example #8
    def parse(self, response):

        node_list = response.xpath('//*[@id="content_left"]/div')
        # with open('baidu_xpath.text', 'w', encoding='utf-8')as f:
        #   f.writelines(node_list.extract())

        for quote in node_list:
            item = MyspiderItem()

            # filter out the ad results
            if len(quote.xpath('./h3')) > 0:
                name_baidu_re = quote.xpath('./h3/a').extract_first()
                # print("q"*30, "\n", name_baidu_re)
                try:
                    # clean the data with a regex and strip the <em> markup
                    ZZ = r'_blank">.*'
                    name_baidu = re.search(ZZ, name_baidu_re).group().replace(
                        "_blank\">",
                        "").replace("<em>",
                                    "").replace("</em>",
                                                "").replace("</a>", "")
                    # print("*" * 30, "\n", name_baidu)
                    link_baidu = quote.xpath('./h3/a/@href').extract_first()

                    item['name_baidu'] = name_baidu
                    item['link_baidu'] = link_baidu
                    yield item
                except AttributeError:
                    # re.search() found no match and returned None
                    pass
Code Example #9
    def parse(self, response):
        # get all the book nodes
        books = response.xpath('//li[@class="result_name"]/..')
        url_list = response.xpath(
            '//li[@class="result_name"]/a/@href').extract()

        for book, url in zip(books, url_list):
            # wrap the data of each book in a fresh item object,
            # so later books do not overwrite earlier ones
            item = MyspiderItem()
            # parse the book's info
            item['name'] = book.xpath(
                './li[@class="result_name"]/a[1]/text()').extract()[0]
            # item['info'] = book.xpath('./li[@class="result_name"]/../li[2]/text()').extract()[0]
            item['price'] = '¥' + book.xpath(
                './li[@class="result_book"]/ul/li[@class="book_dis"]/b/text()'
            ).re(r'¥(\d*\.\d*)')[0]
            # print(item)
            item_1 = str(item)
            yield scrapy.Request(url=url,
                                 meta={'item': item_1},
                                 callback=self.parse_content,
                                 dont_filter=False)

            # print(type(item_1))

            # yield item

        # find the next-page element and build its absolute URL
        next_href = response.css('#right .pro_pag a::attr("href")').extract()[-1]
        url = response.urljoin(next_href)
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=False)
Code Example #10
	def parse(self, response):

		div_list = response.xpath('//div[@class="left"]/div[@class="sons"]')
		for div in div_list:
			titles = div.xpath('.//b/text()').extract_first()
			author = div.xpath('.//p[@class="source"]//text()').extract()
			contents = div.xpath('.//div[@class="contson"]//text()').extract()
			contents = ''.join(contents).strip()
			# poem = {}
			item = MyspiderItem()
			if titles is not None:
				item["标题"] = titles    # 标题 = title
				item["作者"] = author    # 作者 = author
				item["内容"] = contents  # 内容 = content
				yield item
				# print(poem)

		href = response.xpath('//div[@class="pagesright"]/a[@id="amore"]/@href').extract_first()

		# follow the next-page link when one exists (href is None on the last page)
		if href:
			href = response.urljoin(href)
			yield scrapy.Request(
				href,
				callback=self.parse,
			)
Code Example #11
 def parse(self, response):
     # handle the response for start_url
     response_xpath = response.xpath("//title/text()").extract()
     print(response_xpath)
     item = MyspiderItem()
     item["name"] = {"name": "张三", "age": 19}
     yield item
Code Example #12
File: manhua163.py  Project: LoyalWilliams/comic
 def detail_parse(self, response):
     jtext = response.text
     data = json.loads(jtext)
     ep_list = data['data']['ep_list']
     item = MyspiderItem()
     item['comic_id'] = data['data']['id']
     item['name'] = data['data']['title']
     item['author'] = data['data']['author_name']
     item['cover'] = data['data']['vertical_cover']
     item['intr'] = data['data']['evaluate']
     item['last_short_title'] = data['data']['last_short_title']
     for ep in ep_list:
         form_data = {"ep_id": ep['id']}
         item = copy.deepcopy(item)
         item['chapter_title'] = ep['title']
         item['chapter_id'] = ep['id']
         item['chapter_short_title'] = ep['short_title']
         item['chapter_time'] = ep['pub_time']
         target_url = 'https://manga.bilibili.com/twirp/comic.v1.Comic/GetImageIndex?device=pc&platform=web'
         # self.log(item)
         # self.log('qqqqqqqqqq')
         yield scrapy.Request(target_url,
                              body=json.dumps(form_data),
                              method='POST',
                              callback=self.comic_info,
                              meta={'item': item},
                              headers=self.headers)
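
Code Example #12 deep-copies the item per chapter and hands it to the next callback through meta={'item': item}. The comic_info callback itself is not part of the snippet; a minimal sketch of how such a callback usually picks the item back up is shown below (the JSON keys 'images' and 'path' are assumptions for illustration, since the real API response is not shown):

 def comic_info(self, response):
     # retrieve the item that was attached to the request in detail_parse
     item = response.meta['item']
     data = json.loads(response.text)
     # 'images' and 'path' are assumed keys, used only to illustrate the hand-off
     item['image_urls'] = [img['path'] for img in data['data']['images']]
     yield item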
Code Example #13
 def parse(self, response):
     # print(response.text)
     movie_list = response.xpath(
         "//div[@class='article']//ol[@class='grid_view']/li")
     for i_item in movie_list:
         douban_item = MyspiderItem()
         # movie rank number
         douban_item['serial_number'] = i_item.xpath(
             ".//div[@class='item']//em/text()").extract_first()
         # movie name
         douban_item['movie_name'] = i_item.xpath(
             ".//div[@class='hd']/a/span/text()").extract_first()
         # movie description (director, cast, year, genre)
         content = i_item.xpath(
             ".//div[@class='bd']//p[1]/text()").extract()
         for i_content in content:
             content_s = "".join(i_content.split())
             douban_item['introduction'] = content_s
         # number of ratings
         douban_item['evaluate'] = i_item.xpath(
             ".//div[@class='star']//span[4]/text()").extract_first()
         # rating score
         douban_item['stars'] = i_item.xpath(
             ".//div[@class='star']/span[2]/text()").extract_first()
         # one-line quote describing the movie
         douban_item['describe'] = i_item.xpath(
             ".//p[@class='quote']/span/text()").extract_first()
         yield douban_item
     next_link = response.xpath(
         "//span[@class='next']/link/@href").extract()
     if next_link:
         next_link = next_link[0]
         yield scrapy.Request('http://movie.douban.com/top250' + next_link,
                              callback=self.parse)
Code Example #14
 def parse_info(self, response):
     # &since_id={since_id}
     result = json.loads(response.text)
     if result.get('ok') and result.get('data').get('cards'):
         since_id = result.get('data').get('cardlistInfo').get('since_id')
         auth = response.meta['auth']
         uid = response.meta['uid']
         weibos = result.get('data').get('cards')
         for weibo in weibos:
             mblog = weibo.get('mblog')
             if mblog and mblog.get('pics'):
                 pics = mblog.get('pics')
                 for pic in pics:
                     picurl = pic.get('large').get('url')
                     print(since_id, auth, picurl)
                     path = 'd://pic2//' + auth + '//' + re.sub(
                         'https://wx\\d.sinaimg.cn/large/', '', picurl)
                     if not os.path.exists(path):
                         item = MyspiderItem()
                         item['auth'] = auth
                         item['images_urls'] = [picurl]
                         yield item
                         url = self.weibo_url.format(
                             uid=uid) + '&since_id=' + str(since_id)
                         request = scrapy.Request(url,
                                                  callback=self.parse_info)
                         request.meta['auth'] = auth
                         request.meta['uid'] = uid
                         yield request
Code Example #15
File: my_spider.py  Project: CarrieHua/scrapycrawl
 def parse(self, response):
      self.log('Hi, this is an item page! %s' % response.url)
      item = MyspiderItem()
      #item['url'] = response.xpath('//td[@class="first-child bz_id_column"]/a/@href').extract()
      #print item['url']
      item['bug_id'] = response.xpath('//td[@class="first-child bz_id_column"]/a/text()').extract()
      return item
Code Example #16
File: itcast.py  Project: tuess/Scrapy
    def parse(self, response):
##        filename="teacher.html"
##        open(filename,'w').write(response.body)

##        # get the page title
##        context=response.xpath('/html/head/title/text()')
##
##        # extract the page title
##        title=context.extract_first()
##        print(title)
##        pass

        # list that collects the teacher info
        items=[]

        for each in response.xpath("/html/body/div[1]/div[5]/div[2]/div[1]/ul/li[1]/div[2]"):
            item=MyspiderItem()
            # extract() always returns unicode strings
            name=each.xpath("h3/text()").extract()
            title=each.xpath("h4/text()").extract()
            info=each.xpath("p/text()").extract()

##            print (name,title,info)

            # xpath returns a single-element list
            item['name']=name[0]
            item['title']=title[0]
            item['info']=info[0]

            items.append(item)

        # return all the collected data at once
        return items
Code Example #17
    def parse(self, response):
        for each in response.xpath(
                '/html/body/div[2]/div/div/div[2]/div/div[2]/div/ul/li'):
            item = MyspiderItem()

            name = each.xpath('./div[1]/p/text()').extract()
            item['name'] = name[0] if len(name) > 0 else ''
            work = each.xpath('./div[2]/dl[1]/dd/text()').extract()
            item['work'] = work[0] if len(work) > 0 else ''
            tellphone = each.xpath('./div[2]/dl[2]/dd/text()').extract()
            item['tellphone'] = tellphone[0] if len(tellphone) > 0 else ''
            fax = each.xpath('./div[2]/dl[3]/dd/text()').extract()
            item['fax'] = fax[0] if len(fax) > 0 else ''
            email = each.xpath('./div[2]/dl[4]/dd/a/text()').extract()
            item['email'] = email[0] if len(email) > 0 else ''
            researchDirection = each.xpath(
                './div[2]/dl[5]/dd/text()').extract()
            item['researchDirection'] = researchDirection[0] if len(
                researchDirection) > 0 else ''
            yield item

        if self.offset > 1:
            self.offset -= 1

        yield scrapy.Request(self.url + str(self.offset) + '.htm',
                             callback=self.parse)
Code Example #18
File: itcast.py  Project: Summitofpyramid/goSpider
    def parse(self, response):

        # filename = "teacher.html"
        # open(filename, 'wb').write(response.body)
        items = []
        tmp = response.xpath(
            "//form[@id='ajaxtable']/div[@class='show-list']/ul[@class='for-list']/*"
        )
        #
        for each in tmp:
            # wrap the extracted data in an `ItcastItem` object
            item = MyspiderItem()
            # extract() always returns unicode strings
            url = each.xpath(
                "div[@class='titlelink box']/a[@class='truetit']/@href"
            ).extract_first()
            name = each.xpath(
                "div[@class='titlelink box']/a[@class='truetit']/text()"
            ).extract()

            # xpath returns a list containing a single element
            item['url'] = self.domain + url
            item['name'] = name

            # yield item

            items.append(item)

        # return all the data at once
        return items
Code Example #19
    def parse(self, response):
        node_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for node in node_list:
            item = MyspiderItem()
            # print(node.xpath("/p/span/a/text()"))
            item['positionName'] = node.xpath(
                "./p/span/a/text()").extract_first(default="").strip()

            item['positionLink'] = node.xpath(
                "./p/span/a/@href").extract_first(default="").strip()

            item['companyName'] = node.xpath(
                "./span[1]/a/text()").extract_first(default="").strip()

            item['workLocation'] = node.xpath(
                "./span[2]/text()").extract_first(default="").strip()

            item['salary'] = node.xpath("./span[3]/text()").extract_first(
                default="").strip()

            item['publishTime'] = node.xpath("./span[4]/text()").extract_first(
                default="").strip()

            yield item

        # request the next page once per response, after all rows have been handled
        if self.offset < 20:
            self.offset += 1
            url = self.baseURL % (self.offset)
            yield scrapy.Request(url, callback=self.parse)
Code Example #20
 def parse(self, response):
     teacher_list = response.xpath('//div[@class="li_txt"]')
     for each in teacher_list:
         item = MyspiderItem()
         item['name'] = each.xpath('./h3/text()').extract()[0]
         item['title'] = each.xpath('./h4/text()').extract()[0]
         item['info'] = each.xpath('./p/text()').extract()[0]
         yield item
Code Example #21
    def parse(self, response):
        teacher_list = response.xpath('/html/body/div[1]/div[5]/div[2]/div[3]')
        for i in teacher_list:
            item = MyspiderItem()
            name = i.xpath("./ul/li[1]/div[2]/h3/text()").extract_first()
            item["name"] = name
            print(name)
            print("*"*50)

            yield item
Code Example #22
File: a2bulu.py  Project: Sglight/2bulu_scrapy
 def track_download_gpx(self, response):
     data = json.loads(response.text)  # parse the JSON response once
     if data["code"] == "2":
         url = data["url"]
         item = MyspiderItem()
         item["gpxDownloadUrl"] = url
         item["itemType"] = "gpxDownloadUrl"
         yield item
     else:
         logger.error("[GPX] Failed to get the gpx download URL.")
Code Example #23
File: itcast.py  Project: MarRoar/Python-code
    def parse(self, response):
        for each in response.xpath('//div[@class="li_txt"]'):

            item = MyspiderItem()

            item['name'] = each.xpath('./h3/text()').extract()[0]
            item['level'] = each.xpath('./h4/text()').extract()[0]
            item['desc'] = each.xpath('./p/text()').extract()[0]

            yield item
Code Example #24
    def parse(self, response):
        item = MyspiderItem()
        domain = response.url.split("/")[-2]

        # filename = '%s.html' % domain
        # with open(filename, 'wb') as f:
        #     f.write(response.body)
        try:
            if domain == 'weather1d':
                current_tem = response.xpath(
                    "//div[@class='tem']/span/text()").extract()
                humidity = response.xpath(
                    "//div[@class='zs h']/em/text()").extract()
                wind_direct = response.xpath(
                    "//div[@class='zs w']/span/text()").extract()
                wind_level = response.xpath(
                    "//div[@class='zs w']/em/text()").extract()

                item['current_tem'] = float(current_tem[0])
                item['humidity'] = float(humidity[0].strip('%'))
                item['wind_direct'] = wind_direct[0]
                item['wind_level'] = float(wind_level[0].strip('级'))

            elif domain == 'weather':
                seven_day = response.xpath(
                    "//div[@id='7d']/ul[@class='t clearfix']")
                seven_day_lt = []

                for i in range(1, 8):
                    low_temp_lt = seven_day.xpath(
                        "li[%s]/p[@class='tem']/span/text()" %
                        str(i)).extract()
                    high_temp_lt = seven_day.xpath(
                        "li[%s]/p[@class='tem']/i/text()" % str(i)).extract()
                    # print(low_temp_lt, high_temp_lt)

                    low_temp = low_temp_lt[0] if low_temp_lt else ''
                    high_temp = high_temp_lt[0] if high_temp_lt else ''
                    seven_day_lt.append(
                        low_temp.strip('℃') + '/' + high_temp.strip('℃'))

                for i in range(len(seven_day_lt)):
                    item['tem_7d' + str(i)] = seven_day_lt[i]

            elif domain == 'city':
                pm25 = response.xpath(
                    "//div[@class='panel']/b/text()").extract()
                # print('PM2.5:', pm25)
                item['pm25'] = float(pm25[0])
        except Exception:
            print(traceback.format_exc())

        item['city'] = "yanqing"
        yield item
Code Example #25
File: youban.py  Project: vyunwei/spider
 def getdloadurl(self, response):
     item = MyspiderItem()
     storyname = response.xpath(
         "//div[@class='Mp3ErweimaText']/p/span/text()").extract()
     stroyDLK = response.xpath(
         "//div[@class='downloadboxlist']/p/a/@href").extract()
     item['name'] = storyname[0]
     item['linkDX'] = stroyDLK[0]
     item['linkLT'] = stroyDLK[1]
     # print(item)
     return item
Code Example #26
 def parse(self, response):
     area_list = response.xpath(
         "//span[@class='elems-l']/div/a[position()>1]")
     for area in area_list:
         item = MyspiderItem()
         item['area_name'] = area.xpath("./text()").extract_first()
         item['area_href'] = area.xpath("./@href").extract_first()
         # print(item)
         yield scrapy.Request(
             item['area_href'],
             callback=self.parse_detail,
             meta={"item": item},
         )
Code Example #27
    def parse(self, response):
        # select all teacher nodes
        node_list = response.xpath("//div[@class='li_txt']")

        # item_list = []
        # iterate over each teacher node and create an item to hold its info
        for node in node_list:
            item = MyspiderItem()
            item['name'] = node.xpath("./h3/text()").extract_first()
            item['title'] = node.xpath("./h4/text()").extract_first()
            item['info'] = node.xpath('./p/text()').extract_first()

            yield item
Code Example #28
File: itcast.py  Project: mrmcree/py
    def parse(self, response):
        teacher_list = response.xpath('//div[@class="li_txt"]')
        teacherItem = []
        for each in teacher_list:
            item = MyspiderItem()
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()

            item['name'] = name[0].encode('gbk')
            item['title'] = title[0].encode('gbk')
            item['info'] = info[0].encode('gbk')

            yield item
Code Example #29
    def parse(self, response):
        note_list = response.xpath("//div[@class='li_txt']")
        # items=[]
        for node in note_list:
            item = MyspiderItem()
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            yield item
Code Example #30
 def parse(self, response):
     # filename="music.html"
     data = response.body.decode(encoding="utf-8", errors="ignore") #获取响应内容  decode()是解码的意思,从html字节码解码成二进制
     # open(filename,"wb").write(data)#写入本地
 #<a href='/play/16315.html' target="play">06_8号交响曲</a>
     pat1 = re.compile(r'<a href=\'.*?\' target="play">(.*?)</a>')
     pat2 = re.compile(r"<a href='(.*?)' target=\"play\"")
     title = re.findall(pat1,data)
     songUrl=re.findall(pat2,data)
     # items=[]
     for i in range(0, len(title)):
         item = MyspiderItem()
         item["title"] = title[i]
         item["songUrl"] ="http://www.130v.com/"+ songUrl[i]
         yield item  # each item is handed to the pipeline as soon as it is built, instead of waiting for the whole list
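
Several of the snippets (for instance #1 and #30) note that each yielded item goes straight to the item pipeline. None of the pipelines are shown on this page; purely as an illustration, a minimal pipeline that writes every item to a JSON-lines file could look like the sketch below. The class name and file name are made up, and it would still have to be registered in ITEM_PIPELINES.

# pipelines.py (sketch)
import json

class MyspiderPipeline:
    def open_spider(self, spider):
        self.file = open('items.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # called once for every item a spider yields
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()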