def parse(self, response):
    # with open("teacher.html", 'wb') as f:
    #     f.write(response.body)  # dump the response body to a file
    # list collecting all the teacher items
    teacherItem = []
    for each in response.xpath('//div[@class = "li_txt"]'):
        # wrap the extracted data in an ItcastItem object
        item = ItcastItem()
        # extract() converts the selector to a unicode string;
        # without extract() we only have the xpath selector object
        name = each.xpath('./h3/text()').extract()  # xpath always returns a list; its elements depend on the match rule (e.g. text())
        title = each.xpath('./h4/text()').extract()
        info = each.xpath('./p/text()').extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        teacherItem.append(item)
    # return the data in one batch, for saving/export
    return teacherItem
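# Every snippet below instantiates ItcastItem without showing its definition; a
# minimal sketch of the item class they appear to assume (field names are
# inferred from usage; some variants also use title/content/time or
# name/option/answer fields, so the actual items.py may differ):
import scrapy

class ItcastItem(scrapy.Item):
    name = scrapy.Field()   # teacher name
    title = scrapy.Field()  # teacher title/rank
    info = scrapy.Field()   # teacher description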
def parse_detail(self, response):
    # requires: from bs4 import BeautifulSoup
    res = BeautifulSoup(response.body, 'html.parser')
    itcastItem = ItcastItem()
    itcastItem['title'] = res.select('h1')[0].text
    itcastItem['content'] = res.select('p')[0].text
    itcastItem['time'] = res.select('.ndArticle_creat')[0].text
    return itcastItem
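# parse_detail above is a callback, so it only runs if some request names it; a
# hedged sketch of how a listing parse() might schedule it (the link xpath and
# URL handling here are illustrative assumptions, not from the original spider):
def parse(self, response):
    for href in response.xpath('//div[@class="article-list"]//a/@href').extract():
        # resolve relative links and hand each detail page to parse_detail
        yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)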
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    # list that would hold the teacher info if we returned it in one batch
    # items = []
    for node in node_list:
        # create an Item object to hold the data
        item = ItcastItem()
        # extract() converts the objects returned by node.xpath() into unicode strings
        names = node.xpath("./h3/text()").extract()
        titles = node.xpath("./h4/text()").extract()
        infos = node.xpath("./p/text()").extract()
        # xpath() returns a list, so index with [0] to get the value
        # print("name:%s" % names)
        # print("title:%s" % titles)
        item['name'] = names[0]
        item['title'] = titles[0]
        item['info'] = infos[0]
        # items.append(item)
        # hand the data to the pipeline as it is scraped, so items don't pile up in memory
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = ItcastItem()
        item['name'] = node.xpath("./h3/text()").extract()[0]
        item['title'] = node.xpath("./h4/text()").extract()[0]
        item['info'] = node.xpath("./p/text()").extract()[0]
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = ItcastItem()
        name = node.xpath("./h3/text()").extract_first()
        title = node.xpath("./h4/text()").extract_first()
        info = node.xpath("./p/text()").extract_first()
        print(name, title, info)
        item['name'] = name
        item['title'] = title
        item['info'] = info
        yield item
def parse(self, response):
    li_txt = response.xpath("//div[@class='li_txt']")
    for node in li_txt:
        # create the item object that holds the scraped fields
        item = ItcastItem()
        item["name"] = node.xpath("./h3/text()").get()
        item["title"] = node.xpath("./h4/text()").get()
        item["info"] = node.xpath("./p/text()").get()
        # hand the item over to the pipelines
        yield item
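# .get()/.getall() used above are the newer Scrapy/parsel spellings of
# .extract_first()/.extract(); unlike indexing an extract() list with [0],
# .get() returns None (or a supplied default) when nothing matches instead of
# raising IndexError, e.g.:
title = node.xpath("./h4/text()").get(default="n/a")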
def parse(self, response):
    # print(response.body.decode('utf-8'))
    node_list = response.xpath('//*[@class="main_mask"]')
    # items = []
    for node in node_list:
        # create an item object for this node
        item = ItcastItem()
        name = node.xpath('./h2/text()').extract()
        title = node.xpath('./h2/span/text()').extract()
        info = node.xpath('./p/text()').extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0].strip()
        yield item
def parse(self, response):
    # pass
    # print(response.body)
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = ItcastItem()
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = str(name[0])
        item['title'] = str(title[0])
        item['info'] = str(info[0])
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    # items = []
    for node in node_list:
        item = ItcastItem()
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # items.append(item)
        yield item
def parse(self, response):
    # items = []
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = ItcastItem()
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # yield each extracted item to the pipeline, then resume the loop right here
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        # create an item object to hold the fields
        item = ItcastItem()
        # .extract() converts the xpath selectors into unicode strings
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # yield each item to the pipeline, then come back and continue the loop
        yield item
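# The "hand it to the pipeline" comments above assume a pipeline class is
# enabled in settings.py; a minimal sketch (the class, module, and priority
# names here are illustrative, not from the original project):
class ItcastPipeline:
    def process_item(self, item, spider):
        # called once per yielded item; must return the item (or raise DropItem)
        print(item['name'], item['title'])
        return item

# settings.py:
# ITEM_PIPELINES = {"myspider.pipelines.ItcastPipeline": 300}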
def parse(self, response):
    node_list = response.xpath('//div[@class="li_txt"]')
    # items = []
    for node in node_list:
        item = ItcastItem()
        # .extract_first() converts the first matched xpath object into text
        item['name'] = node.xpath('./h3/text()').extract_first()
        item['title'] = node.xpath('./h4/text()').extract_first()
        item['info'] = node.xpath('./p/text()').extract_first()
        # return the item to the pipeline:
        # return item
        # or hand a url back to the engine to keep crawling:
        # return scrapy.Request(url)
        # items.append(item)
        # yield the data to the pipeline; execution then resumes here
        yield item
def parse(self, response):
    # print(response)
    node_list = response.xpath('//div[@class="li_txt"]')
    for node in node_list:
        item = ItcastItem()
        # extract() converts the xpath selectors into unicode strings
        name = node.xpath('./h3/text()').extract()
        title = node.xpath('./h4/text()').extract()
        info = node.xpath('./p/text()').extract()
        # xpath returns a list, so index [0] to get the string
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")  # a list to hold all the items used to go here
    for node in node_list:
        # create an item object to hold the fields
        item = ItcastItem()
        # .extract() converts the xpath selectors into unicode strings
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # yield each item to the pipeline, then come back and continue the for loop
        yield item
    print(response.body)
def parse(self, response):
    # print the raw response body
    # print(response.body)
    node_list = response.xpath("//div[@class='li_txt']")
    # used to hold all the item fields
    for node in node_list:
        # create an item object to hold the fields
        item = ItcastItem()
        # .extract() converts the xpath selectors into unicode strings (slightly different from the other examples)
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # hand each item to the pipeline, one at a time as it is built
        yield item
def parse(self, response):
    # using xpath:
    # node_list = response.xpath("//div[@class='li_txt']")
    # using a css selector:
    node_list = response.css(".li_txt")
    for node in node_list:
        item = ItcastItem()
        # xpath version:
        # item['name'] = node.xpath('./h3/text()').extract()[0]
        # item['title'] = node.xpath('./h4/text()').extract()[0]
        # item['info'] = node.xpath('./p/text()').extract()[0]
        # css selector version:
        item['name'] = node.css('h3::text').extract()[0]
        item['title'] = node.css('h4::text').extract()[0]
        item['info'] = node.css('p::text').extract()[0]
        print(item['name'])
        print(item['title'])
        print(item['info'])
        yield item
def parse(self, response):
    # list of matched nodes
    node_list = response.xpath("//div[@class='li_txt']")
    # used to collect all the items
    items = []
    for node in node_list:
        # create an item object to hold the fields
        item = ItcastItem()
        # .extract() converts the xpath selectors into unicode strings
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item["name"] = name[0]
        item["title"] = title[0]
        item["info"] = info[0]
        items.append(item)
    return items
def parse(self, response):
    print('djh:ItcastSpider...parse()')
    node_list = response.xpath("//div[@class='li_txt']")
    items = []
    for node in node_list[:5]:
        # create the item object
        item = ItcastItem()
        # teacher name (extract() turns the xpath selectors into a list)
        item['name'] = node.xpath("./h3/text()").extract()[0]
        # teacher title
        item['title'] = node.xpath("./h4/text()").extract()[0]
        # teacher info
        item['info'] = node.xpath("./p/text()").extract()[0]
        items.append(item)
    return items
def parse(self, response):
    # print(response.body)
    node_list = response.xpath("//div[@class='main_mask']")
    # store all the items
    # items = []
    for node in node_list:
        # create an item for storing the data
        item = ItcastItem()
        # .extract() changes the xpath object into a unicode string
        name = node.xpath("./h2/text()").extract()
        title = node.xpath("./h2/span/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # items.append(item)
        # yield every item to the pipelines; meanwhile execution comes
        # back and continues with the code that follows
        yield item
def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")  # matched nodes
    # items = []  # list for all the items; unused now that a pipeline handles them,
    # and it must not be created inside the for loop or only the last item survives
    for node in node_list:
        # create the item object inside the for loop (the class was imported earlier)
        # so each node gets its own item
        item = ItcastItem()
        # node.xpath() returns selector objects; .extract() converts them to
        # unicode strings (i.e. it extracts the field as a string)
        name = node.xpath("./h3/text()").extract()
        # same as item['name'] = node.xpath("./h3/text()").extract()[0], just split in two steps
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()  # xpath returns a single-element list
        # print(name[0])
        # print(title[0])
        # print(info[0])
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # items.append(item)  # collect the item in the list
        # yield each extracted item to the pipeline; on the next pass execution
        # resumes here, whereas return would end the function outright
        yield item
    # or return all the data at the end:
    # return items
def parse(self, response):
    item_list = []
    node_list = response.xpath("//div[@id='shiti-content']/div[@class='shiti']")
    # print("hello world!")
    # print(response.xpath("//div[@class='shiti-content']/div[@class='shiti']"))
    answer_list = response.xpath("//div[@class='shiti-content']/span/text()").extract()
    # print(len(node_list))
    # print(len(answer_list))
    for i in range(len(node_list)):
        item = ItcastItem()
        option = {}
        # .extract() converts the xpath objects into unicode strings
        name = node_list[i].xpath("./h3/text()").extract()[0]
        option_list = node_list[i].xpath("./ul/li/label/text()").extract()
        # print(type(option_list))
        for op in range(len(option_list)):
            option[op] = option_list[op]
        # print(type(option))
        answer = answer_list[i]
        # print(name)
        item["name"] = name
        # print(option)
        item["option"] = option
        # print(answer)
        item["answer"] = answer
        # return item
        # item_list.append(item)
        # or hand a url to the scheduler: return scrapy.Request(url)
        # yield the data to the pipelines instead of stuffing every item into
        # item_list, which would keep all the data in memory; execution resumes
        # here once the pipeline has processed the item
        yield item  # the yielded value goes back to the engine
        # return item_list
    # single-bank crawl:
    # if self.offset < 76:
    #     self.offset += 1
    #     url = self.baseURL + str(self.offset)
    #     print(url)
    #     yield scrapy.Request(url, callback=self.parse)
    # url building for the multi-bank crawl:
    # once one question bank is exhausted, move on to the next one
    print(self.examLib[self.exam_list[self.KuNumber]][1])
    print(len(self.exam_list))
    print(self.KuNumber)
    if (self.offset < self.examLib[self.exam_list[self.KuNumber]][1]
            and self.KuNumber < len(self.exam_list)):
        self.offset += 1
        url = (self.baseURL + 'tikubh=' + str(self.examLib[self.exam_list[self.KuNumber]][0])
               + '&page=' + str(self.offset))
        print(url)
        yield scrapy.Request(url, callback=self.parse)
    elif self.KuNumber < len(self.exam_list) - 1:  # - 1 guards the index after the increment below
        self.offset = 1
        self.KuNumber += 1
        url = (self.baseURL + 'tikubh=' + str(self.examLib[self.exam_list[self.KuNumber]][0])
               + '&page=' + str(self.offset))
        print(url)
        yield scrapy.Request(url, callback=self.parse)
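# The parse() above references several spider attributes it never defines; a
# hedged sketch of what they plausibly look like (attribute names are kept,
# but the class name, URL, and all shapes/values are assumptions inferred from
# how the code indexes them):
import scrapy

class ExamSpider(scrapy.Spider):
    name = "exam"
    baseURL = "http://example.com/shiti?"               # assumed; the query string is appended
    examLib = {"bankA": (101, 76), "bankB": (102, 40)}  # bank -> (tikubh id, page count)
    exam_list = list(examLib)                           # bank keys, indexed by KuNumber
    KuNumber = 0                                        # index of the current question bank
    offset = 1                                          # current page within that bank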