def parse(self, response):
    """Parse the faculty listing page.

    For each teacher entry: append an index row (title, link, school,
    parent URL), save the anchor-text markup, schedule the portrait
    download and schedule the detail-page parse.
    """
    # Table holding the faculty list; each teacher's link/anchor lives in
    # an Article_MicroImage span.
    teacher_items = response.xpath('//table[@class ="wp_article_list_table"]')
    nexturls = teacher_items.xpath('.//span[@class="Article_MicroImage"]')
    # Index file is append-mode: one CSV-ish row per teacher.
    with open('../docs/%s/index.txt' % self.name, 'a',
              encoding='utf-8') as file:
        for urlt in nexturls:
            print(urlt.get())
            # Hoist the repeated XPath extractions; skip malformed entries
            # instead of crashing on None + str concatenation.
            title = urlt.xpath('.//a/@title').get()
            href = urlt.xpath(".//a/@href").get()
            if title is None or href is None:
                continue
            file.write(title + "," + href + "," + "南开大学文学院" + "," +
                       response.url + '\n')
            # Save the raw span markup as this teacher's anchor text.
            with open('../docs/%s/m_text/%s_m.txt' % (self.name, title),
                      'w', encoding='utf-8') as m_f:
                m_f.write(str(urlt.get()))
            # Schedule the portrait download; guard a missing <img> (the
            # original crashed on baseurl + None).
            imgsrc = urlt.xpath('.//img/@src').get()
            if imgsrc is not None:
                item = TeacherinfoItem()
                item['image_name'] = title
                item['image_url'] = self.baseurl + imgsrc
                print(item['image_name'], item['image_url'])
                request = scrapy.Request(url=item['image_url'],
                                         callback=self.parseImg)
                request.meta['item'] = item
                yield request
            # Schedule the detail-page parse for this teacher.
            yield scrapy.Request(url=href, callback=self.parseTeacher)
def parseTeacher(self, response):
    """Parse one teacher's detail page (simpleArticleAttri layout).

    Saves the cleaned profile text, appends an index row, stores the
    anchor text passed via ``response.meta``, writes an HTML snapshot,
    then schedules the portrait download.

    NOTE(review): other defs named ``parseTeacher`` appear in this source;
    in a single class the later one would shadow this — presumably they
    belong to different spider classes. Confirm.
    """
    data = response.meta['item']

    def _clean(text):
        # Strip newline/space/CR noise from a scraped text node.
        return str(text).replace('\n', '').replace(' ', '').replace('\r', '')

    details = response.xpath('//div[@portletmode="simpleArticleAttri"]')
    filename = _clean(details.xpath('.//div[@class="name"]/text()').get())
    # Profile text: the name line, then contact-info and tab sections.
    with open('../docs/%s/%s.txt' % (self.name, filename), 'w',
              encoding='utf-8') as f:
        f.write(filename + '\n')
        sections = list(
            details.xpath('.//div[@class = "lxfs-info"]').xpath(
                './/div[@class="info"]')) + list(
            details.xpath('.//div[@class="layui-tab layui-tab-brief"]'))
        for section in sections:
            for text in section.xpath('.//text()').getall():
                f.write(_clean(text))
            f.write('\n')
    # Index row: name, page URL, school, parent URL (appended).
    with open('../docs/%s/index.txt' % self.name, 'a',
              encoding='utf-8') as file:
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
    # Anchor text captured on the listing page.
    with open('../docs/%s/m_text/%s_m.txt' % (self.name, filename), 'w',
              encoding='utf-8') as m_f:
        m_f.write(str(data["m_text"]))
    # Raw HTML snapshot of this page.
    with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
              'wb') as s_f:
        s_f.write(response.body)
    # Schedule the portrait download; guard a missing <img> (the original
    # crashed on baseurl + None).
    imgurl = details.xpath('.//img/@src').get()
    if imgurl is not None:
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        request = scrapy.Request(url=item['image_url'],
                                 callback=self.parseImg)
        request.meta['item'] = item
        yield request
def parseTeacher2(self, response):
    """Parse a list-style teacher page (one ``jz_li_div`` per teacher).

    For each entry: write an HTML snapshot (of the shared listing page,
    stored under the teacher's name), the cleaned profile text, an index
    row and the anchor text, then schedule the portrait download.
    """

    def _clean(text):
        # Strip newline/space/CR noise from a scraped text node.
        return str(text).replace('\n', '').replace(' ', '').replace('\r', '')

    details = response.xpath('.//div[@class="jz_li_div"]')
    for entry in details:
        filename = entry.xpath('.//h3/text()').get()
        # Snapshot of the page, filed under this teacher's name.
        with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
                  'wb') as s_f:
            s_f.write(response.body)
        print(filename)
        imgurl = entry.xpath('.//img/@src').get()
        # Separate name: the original reused `temp` for both the entry and
        # its content block, which obscured the logic.
        content = entry.xpath('.//div[@class="jz_li_content"]')
        with open('../docs/%s/%s.txt' % (self.name, filename), 'w',
                  encoding='utf-8') as f:
            f.write(filename + '\n')
            for paragraph in content.css('p'):
                print(paragraph)
                for text in paragraph.xpath('.//text()').getall():
                    f.write(_clean(text))
                f.write('\n')
        # Index row: name, page URL, school, parent URL (appended).
        with open('../docs/%s/index.txt' % self.name, 'a',
                  encoding='utf-8') as file:
            file.write(filename + "," + response.url + "," + "南开大学经济学院" +
                       "," + response.url + '\n')
        # Anchor text: the content block markup for this entry.
        with open('../docs/%s/m_text/%s_m.txt' % (self.name, filename),
                  'w', encoding='utf-8') as m_f:
            m_f.write(str(content.get()))
        # Skip entries without a portrait.
        if imgurl is None:
            continue
        print(imgurl)
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        request = scrapy.Request(url=item['image_url'],
                                 callback=self.parseImg)
        request.meta['item'] = item
        yield request
def parseTeacher(self, response):
    """Parse a teacher detail page built from nested tables (frag=面板21).

    Saves the cleaned profile text, appends an index row, stores the
    anchor text from ``response.meta``, writes an HTML snapshot, then
    schedules the portrait download.

    NOTE(review): other defs named ``parseTeacher`` appear in this source;
    in a single class the later one would shadow the earlier — presumably
    they belong to different spider classes. Confirm.
    """
    data = response.meta['item']

    def _clean(text):
        # Strip newline/space/CR noise from a scraped text node.
        return str(text).replace('\n', '').replace(' ', '').replace('\r', '')

    details = response.xpath('.//div[@frag="面板21"]')
    filename = details.xpath('.//table[1]/tr[1]/td/text()').get()
    print(filename)
    # The profile body lives in the tables nested under table[4]; the
    # original reassigned `details` here, which obscured the logic.
    body_tables = details.xpath('.//table[4]').xpath(".//table")
    with open('../docs/%s/%s.txt' % (self.name, filename), 'w',
              encoding='utf-8') as f:
        f.write(filename + '\n')
        for paragraph in body_tables.css('p'):
            print(paragraph)
            for text in paragraph.xpath('.//text()').getall():
                f.write(_clean(text))
            f.write('\n')
    # Index row: name, page URL, school, parent URL (appended).
    with open('../docs/%s/index.txt' % self.name, 'a',
              encoding='utf-8') as file:
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
    # Anchor text captured on the listing page.
    with open('../docs/%s/m_text/%s_m.txt' % (self.name, filename), 'w',
              encoding='utf-8') as m_f:
        m_f.write(str(data["m_text"]))
    # Raw HTML snapshot of this page.
    with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
              'wb') as s_f:
        s_f.write(response.body)
    # Portrait sits in the styled td inside the body tables; guard a
    # missing <img> (the original crashed on baseurl + None).
    imgurl = body_tables.xpath('.//td[@class="MsoNormal STYLE1"]').xpath(
        './/img/@src').get()
    print(imgurl)
    if imgurl is not None:
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        request = scrapy.Request(url=item['image_url'],
                                 callback=self.parseImg)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a teacher listing page and yield one TeacherinfoItem per
    ``li_txt`` entry (name, position, info).
    """
    print(response.body)  # debug: dump the raw page body
    for each in response.xpath("//div[@class='li_txt']"):
        # Wrap the scraped fields in a TeacherinfoItem.
        item = TeacherinfoItem()
        # .get() returns the first match or None — replaces the deprecated
        # extract() + [0], which raised IndexError when a node was absent.
        # (The original also accumulated items in an unused list; dropped.)
        item['name'] = each.xpath("h3/text()").get()
        item['position'] = each.xpath("h4/text()").get()
        item['info'] = each.xpath("p/text()").get()
        yield item
def parseTeacher(self, response):
    """Parse a teacher detail page (leader-list layout).

    The file name keeps only the CJK and ASCII-letter runs of the
    displayed name. Saves the cleaned profile text, appends an index row,
    writes an HTML snapshot and the anchor text from ``response.meta``,
    then schedules the portrait download.
    """
    data = response.meta['item']

    def _clean(text):
        # Strip newline/space/CR noise from a scraped text node.
        return str(text).replace('\n', '').replace(' ', '').replace('\r', '')

    details = response.xpath(
        '//div[@class="leader-list leader cl"]/div[@class="wp_articlecontent"]'
    )
    # findall with '*' quantifiers yields many empty strings; join the
    # non-empty runs (replaces the original quadratic += concatenation).
    splitwords = re.findall(
        r'[\u4e00-\u9fa5]*|[a-zA-Z]*',
        str(details.xpath(
            './/div[@class = "info"]/div[@class="name"]/text()').get()))
    filename = ''.join(p for p in splitwords if len(p) >= 1)
    # Profile text: the name line, then label-value rows and tab paragraphs.
    with open('../docs/%s/%s.txt' % (self.name, filename), 'w',
              encoding='utf-8') as f:
        f.write(filename + '\n')
        sections = list(
            details.xpath('.//div[@class = "info"]').xpath(
                './/div[@class="label-value"]')) + list(
            details.xpath('.//div[@id="tabsDiv"]').css('p'))
        for section in sections:
            print(section)
            for text in section.xpath('.//text()').getall():
                f.write(_clean(text))
            f.write('\n')
    # Index row: name, page URL, school, parent URL (appended).
    with open('../docs/%s/index.txt' % self.name, 'a',
              encoding='utf-8') as file:
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
    # Raw HTML snapshot of this page.
    with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
              'wb') as s_f:
        s_f.write(response.body)
    # Anchor text captured on the listing page.
    with open('../docs/%s/m_text/%s_m.txt' % (self.name, filename), 'w',
              encoding='utf-8') as m_f:
        m_f.write(str(data["m_text"]))
    # Schedule the portrait download; guard a missing <img> (the original
    # crashed on baseurl + None).
    imgurl = details.xpath('.//img/@src').get()
    if imgurl is not None:
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        request = scrapy.Request(url=item['image_url'],
                                 callback=self.parseImg)
        request.meta['item'] = item
        yield request