def parse(self, response):
    """Yield one ``eduItem`` per ``<li>`` of the listing, tagged with mid 118.

    ``name`` is the stripped text of the entry's ``<a>`` and ``url`` is the
    anchor's ``href`` exactly as found in the page. Both values are echoed
    to stdout.

    Raises:
        IndexError: if an entry has no ``<a>`` text or no ``href``.
    """
    mid = 118
    # The rows used to be wrapped in ``for i in range(1, 2)`` with an
    # ``if i == 8: continue`` guard. That loop ran exactly once and the
    # guard could never fire, so the listing is iterated directly.
    entries = response.xpath(
        '/html/body/div[1]/div[3]/div/div[1]/div/div/div[2]/div[2]/div[2]/div[2]/div[2]/ul/li'
    )
    for entry in entries:
        item = eduItem()
        item['mid'] = mid
        item['name'] = entry.xpath('./a/text()').extract()[0].strip()
        print(item['name'])
        item['url'] = entry.xpath('./a/@href').extract()[0]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<li>`` of the listing, tagged with mid 72.

    ``name`` is the stripped anchor text; ``url`` is the anchor's ``href``
    appended to ``http://www.ytmuseum.com``. Both values are echoed to
    stdout.

    Raises:
        IndexError: if an entry has no ``<a>`` text or no ``href``.
    """
    mid = 72
    site_root = "http://www.ytmuseum.com"
    # The former ``for i in range(1, 2)`` wrapper executed once and never
    # read ``i``; it has been dropped.
    for entry in response.xpath('/html/body/div[2]/div[3]/div/div[1]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        item['name'] = entry.xpath('./a/text()').extract()[0].strip()
        print(item['name'])
        item['url'] = site_root + entry.xpath('./a/@href').extract()[0]
        print(item['url'])
        yield item
def parse_details(self, response):
    """Yield a single ``eduItem`` combining request meta with the page text.

    ``mid``, ``name`` and ``url`` are copied from ``response.meta``;
    ``details`` is the first text node of the selected ``<p>``.

    Raises:
        KeyError: if one of the three meta keys is absent.
        IndexError: if the paragraph XPath matches no text.
    """
    item = eduItem()
    for field in ('mid', 'name', 'url'):
        item[field] = response.meta[field]
    paragraph_texts = response.xpath(
        '/html/body/div[2]/div[2]/div[4]/div/div/p[1]/text()').extract()
    item['details'] = paragraph_texts[0]
    yield item
def parse(self, response):
    """Yield one ``eduItem`` per matched table row, tagged with mid 42.

    The matched selector list is printed first. For each row, ``name`` is
    the first anchor text in the nested second cell and ``url`` is that
    anchor's ``href`` appended to ``http://www.cyjng.net``. Both values are
    echoed to stdout.

    Raises:
        IndexError: if a row lacks the expected anchor text or ``href``.
    """
    mid = 42
    site_root = "http://www.cyjng.net"
    # One query serves both the diagnostic print and the iteration.
    rows = response.xpath(
        '/html/body/form/div[3]/table/tr/td/table/tr[3]/td/table/tr[3]/td/table/tr/td[3]/div/span[2]/div/table[1]/tr[1]'
    )
    print(rows)
    for row in rows:
        item = eduItem()
        item['mid'] = mid
        titles = row.xpath('./td/table/tr/td[2]/a/text()').extract()
        item['name'] = titles[0]
        print(item['name'])
        hrefs = row.xpath('./td/table/tr/td[2]/a/@href').extract()
        item['url'] = site_root + hrefs[0]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<div>``, tagged with mid 44.

    ``name`` is the header anchor's text. ``url`` is the anchor's ``href``
    with its first character removed, appended to
    ``http://www.19371213.com.cn/learn/programme``. The name, the raw
    ``href`` list and the final url are echoed to stdout.

    Raises:
        IndexError: if a card lacks the header anchor text or ``href``.
    """
    mid = 44
    url_prefix = "http://www.19371213.com.cn/learn/programme"
    cards = response.xpath(
        '/html/body/div[8]/div/div/div/div/div[2]/div/div/div[2]/div/div/div')
    for card in cards:
        item = eduItem()
        item['mid'] = mid
        item['name'] = card.xpath('./section/div[2]/header/h3/a/text()').extract()[0]
        print(item['name'])
        hrefs = card.xpath('./section/div[2]/header/h3/a/@href').extract()
        print(hrefs)
        # The leading character of the href is dropped before joining.
        item['url'] = url_prefix + hrefs[0][1:]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` for rows 1, 4, 7, 10 and 13 of the inner table.

    Items are tagged with mid 70. ``name`` is the stripped ``<span>`` text
    inside the first cell's anchor; ``url`` is that anchor's ``href``
    appended to ``http://bowuguan.qingzhou.gov.cn/jy/sjhd/tsjy``. Both
    values are echoed to stdout.

    Raises:
        IndexError: if a matched row lacks the span text or the ``href``.
    """
    mid = 70
    url_prefix = "http://bowuguan.qingzhou.gov.cn/jy/sjhd/tsjy"
    # NOTE(review): the path contains ``tbody`` steps, which only match when
    # the served HTML itself contains <tbody> elements -- confirm against the
    # raw response rather than the browser DOM.
    row_template = '/html/body/div[2]/table/tbody/tr[4]/td/table/tbody/tr[{}]'
    for row_number in range(1, 15, 3):
        for row in response.xpath(row_template.format(row_number)):
            item = eduItem()
            item['mid'] = mid
            span_texts = row.xpath('./td[1]/a/span/text()').extract()
            item['name'] = span_texts[0].strip()
            print(item['name'])
            hrefs = row.xpath('./td[1]/a/@href').extract()
            item['url'] = url_prefix + hrefs[0]
            print(item['url'])
            yield item
def parse(self, response):
    """Yield one ``eduItem`` per record of the JSON payload, tagged with mid 51.

    The JSON text is read from the first text node of ``/html/body/pre``
    and decoded; each element of its ``data`` list supplies ``title``
    (stored as ``name``) plus ``category`` and ``id``, which are formatted
    into the detail-page url. Name and url are echoed to stdout.

    Raises:
        IndexError: if the ``<pre>`` text node is missing.
        KeyError: if ``data`` or one of the per-record keys is absent.
    """
    mid = 51
    detail_url = "http://www.zmnh.com/news/news_info.html?category={}&infotype=8&id={}"
    raw_json = response.xpath('/html/body/pre/text()').extract()[0]
    payload = json.loads(raw_json)
    for record in payload['data']:
        item = eduItem()
        item['mid'] = mid
        item['name'] = record['title']
        print(item['name'])
        item['url'] = detail_url.format(record['category'], record['id'])
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<td width="660">`` cell, tagged with mid 1.

    ``name`` receives the full list of anchor text nodes (not a single
    string); ``url`` is the first ``href`` appended to
    ``https://www.dpm.org.cn``.

    Raises:
        IndexError: if a cell has no anchor ``href``.
    """
    mid = 1
    site_root = 'https://www.dpm.org.cn'
    for cell in response.xpath('//table/tr/td[@width="660"]'):
        anchor_texts = cell.xpath('./a/text()').extract()
        hrefs = cell.xpath('./a/@href').extract()
        item = eduItem()
        item['mid'] = mid
        item['name'] = anchor_texts
        item['url'] = site_root + hrefs[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 4.

    ``name`` receives the list of ``<h3>`` text nodes inside the anchor;
    ``url`` is the anchor's ``href`` minus its first character, appended to
    ``http://www.jb.mil.cn/cyhd/jzhd/``.

    Raises:
        IndexError: if an entry has no anchor ``href``.
    """
    mid = 4
    base_url = 'http://www.jb.mil.cn/cyhd/jzhd/'
    for entry in response.xpath('/html/body/div[4]/div/div[3]/ul/li'):
        heading_texts = entry.xpath('./a/h3/text()').extract()
        hrefs = entry.xpath('./a/@href').extract()
        item = eduItem()
        item['mid'] = mid
        item['name'] = heading_texts
        # The first character of the href is discarded before joining.
        item['url'] = base_url + hrefs[0][1:]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<p>``, tagged with mid 8.

    ``name`` receives the list of ``<span>`` text nodes inside the anchor;
    ``url`` is the anchor's first ``href`` appended to
    ``http://www.bmnh.org.cn/``.

    Raises:
        IndexError: if a paragraph has no anchor ``href``.
    """
    mid = 8
    base_url = 'http://www.bmnh.org.cn/'
    for paragraph in response.xpath('/html/body/div[3]/div[2]/div[2]/p'):
        span_texts = paragraph.xpath('./a/span/text()').extract()
        hrefs = paragraph.xpath('./a/@href').extract()
        item = eduItem()
        item['mid'] = mid
        item['name'] = span_texts
        item['url'] = base_url + hrefs[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per odd-numbered cell (1, 3, ..., 11) of the first row.

    Items are tagged with mid 20. ``name`` receives the list of ``title``
    attributes of the cell's direct anchors; ``url`` is the first ``href``
    found under ``./div[2]/a`` appended to
    ``http://www.luxunmuseum.com.cn/``. Cells without such a link are
    skipped.
    """
    mid = 20
    base_url = 'http://www.luxunmuseum.com.cn/'
    # Fix: the path used to end in ``td{}``, which formatted to element names
    # such as ``td1`` and ``td3`` and therefore matched nothing. A positional
    # predicate ``td[{}]`` selects the intended column.
    cell_template = '/html/body/table[4]/tr/td[3]/table/tr/td/table[2]/tr/td/table/tr/td/table/tr/td/table[1]/tr[1]/td[{}]'
    for column in range(1, 12, 2):
        for cell in response.xpath(cell_template.format(column)):
            # NOTE(review): the name is read from ``./a`` but the link from
            # ``./div[2]/a`` -- confirm which anchor the page really uses.
            hrefs = cell.xpath('./div[2]/a/@href').extract()
            if not hrefs:
                # A cell with no link cannot produce a usable item; skip it
                # rather than abort the remaining columns with IndexError.
                continue
            item = eduItem()
            item['mid'] = mid
            item['name'] = cell.xpath('./a/@title').extract()
            item['url'] = base_url + hrefs[0]
            yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<div>``, tagged with mid 3.

    ``name`` receives the list of text nodes under ``./a/div[1]/div[1]``;
    ``url`` is the anchor's first ``href`` appended to
    ``http://www.gmc.org.cn``.

    Raises:
        IndexError: if a block has no anchor ``href``.
    """
    mid = 3
    site_root = 'http://www.gmc.org.cn'
    for block in response.xpath('/html/body/div[4]/div/div/div[3]/div[1]/div'):
        title_texts = block.xpath('./a/div[1]/div[1]/text()').extract()
        hrefs = block.xpath('./a/@href').extract()
        item = eduItem()
        item['mid'] = mid
        item['name'] = title_texts
        item['url'] = site_root + hrefs[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<div>``, tagged with mid 21.

    Prints ``1`` once before iterating. ``name`` receives the list of
    anchor text nodes under ``./div[2]/a``; ``url`` is that anchor's first
    ``href`` appended to ``http://www.luxunmuseum.com.cn/``.

    Raises:
        IndexError: if a block has no ``./div[2]/a`` ``href``.
    """
    mid = 21
    base_url = 'http://www.luxunmuseum.com.cn/'
    print(1)
    for block in response.xpath('/html/body/div[6]/div[2]/div/div[3]/div[1]/div'):
        anchor_texts = block.xpath('./div[2]/a/text()').extract()
        hrefs = block.xpath('./div[2]/a/@href').extract()
        item = eduItem()
        item['mid'] = mid
        item['name'] = anchor_texts
        item['url'] = base_url + hrefs[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 22.

    ``name`` receives the list of text nodes of the ``<h2>`` anchor;
    ``url`` is that anchor's first ``href``, used as-is (no base url is
    prepended).

    Raises:
        IndexError: if an entry has no ``./h2/a`` ``href``.
    """
    mid = 22
    # The unused ``rootUrl`` local and the per-row ``print(1)`` debug marker
    # were removed; neither affected the yielded items.
    for entry in response.xpath('/html/body/div[5]/div/div[2]/div[2]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        item['name'] = entry.xpath('./h2/a/text()').extract()
        item['url'] = entry.xpath('./h2/a/@href').extract()[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<li>`` of the second ``<ul>``, tagged with mid 11.

    The selector list for the first ``<li>`` is printed before iterating.
    ``name`` receives the list of anchor text nodes; ``url`` is the
    anchor's ``href`` minus its first character, appended to
    ``http://www.chnmuseum.cn``. Name and url are echoed to stdout.

    Raises:
        IndexError: if an entry has no anchor ``href``.
    """
    mid = 11
    site_root = 'http://www.chnmuseum.cn'
    first_entry = response.xpath('/html/body/div[4]/div/div/div/ul[2]/li[1]')
    print(first_entry)
    for entry in response.xpath('/html/body/div[4]/div/div/div/ul[2]/li'):
        item = eduItem()
        item['mid'] = mid
        item['name'] = entry.xpath('./a/text()').extract()
        print(item['name'])
        hrefs = entry.xpath('./a/@href').extract()
        # The first character of the href is discarded before joining.
        item['url'] = site_root + hrefs[0][1:]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 9.

    ``name`` receives the list of ``<p>`` text nodes inside the anchor;
    ``url`` is the anchor's first ``href``, used as-is. Name and url are
    echoed to stdout.

    Raises:
        IndexError: if an entry has no anchor ``href``.
    """
    mid = 9
    # Removed: the unused ``rootUrl`` local, and an extra XPath query that
    # was executed only to print the container's selector list.
    for entry in response.xpath('/html/body/div[3]/div[2]/div/div[2]/div/ul/li'):
        item = eduItem()
        item['mid'] = mid
        item['name'] = entry.xpath('./a/p/text()').extract()
        print(item['name'])
        item['url'] = entry.xpath('./a/@href').extract()[0]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<div class="jchg-cont">``, tagged with mid 2.

    ``name`` receives the list of text nodes of the ``jchg-name`` anchor
    inside the ``jchg-info`` block; ``url`` is that anchor's first
    ``href``, used as-is.

    Raises:
        IndexError: if a container has no ``jchg-name`` anchor ``href``.
    """
    mid = 2
    # The ``rootUrl`` local was assigned but never read, so it was dropped.
    for container in response.xpath('/html/body/div[1]/div/div[@class="jchg-cont"]'):
        item = eduItem()
        item['mid'] = mid
        item['name'] = container.xpath(
            './div[@class="jchg-info"]/a[@class="jchg-name"]/text()').extract()
        item['url'] = container.xpath(
            './div[@class="jchg-info"]/a[@class="jchg-name"]/@href').extract()[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 31.

    ``name`` is the first anchor text, ``url`` is the anchor's ``href``
    appended to ``http://www.jlmuseum.org``, and ``details`` is the
    stripped first ``<span>`` text. Name and url are echoed to stdout.

    Raises:
        IndexError: if an entry lacks the anchor text, ``href`` or span text.
    """
    mid = 31
    site_root = "http://www.jlmuseum.org"
    for entry in response.xpath('/html/body/div[2]/div[2]/div[3]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        anchor_texts = entry.xpath('./a/text()').extract()
        item['name'] = anchor_texts[0]
        print(item['name'])
        hrefs = entry.xpath('./a/@href').extract()
        item['url'] = site_root + hrefs[0]
        print(item['url'])
        span_texts = entry.xpath('./span/text()').extract()
        item['details'] = span_texts[0].strip()
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 15.

    ``name`` is the stripped ``<h3>`` text inside the anchor's
    ``<div class="text">``; ``url`` is the anchor's ``href`` appended to
    ``http://www.tjbwg.com/cn/``. Both values are echoed to stdout.

    Raises:
        IndexError: if an entry lacks the heading text or the ``href``.
    """
    mid = 15
    base_url = 'http://www.tjbwg.com/cn/'
    entries = response.xpath(
        '/html/body/div/div[3]/div/div/div[2]/div[2]/div/div/ul/li')
    for entry in entries:
        item = eduItem()
        item['mid'] = mid
        heading_texts = entry.xpath('./a/div[@class="text"]/h3/text()').extract()
        item['name'] = heading_texts[0].strip()
        print(item['name'])
        hrefs = entry.xpath('./a/@href').extract()
        item['url'] = base_url + hrefs[0]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<li>`` of the 1st and 5th ``<ul>``, tagged with mid 23.

    ``name`` receives the list of ``<strong>`` text nodes under
    ``./div/a/h3``; ``url`` is the anchor's first ``href`` appended to
    ``http://www.balujun.cn``. Both values are echoed to stdout.

    Raises:
        IndexError: if an entry has no ``./div/a`` ``href``.
    """
    mid = 23
    site_root = "http://www.balujun.cn"
    list_template = '/html/body/div/div/div[2]/div[2]/ul[{}]/li'
    # NOTE(review): ``(1, 5)`` is a two-element tuple, so only ul[1] and
    # ul[5] are visited -- confirm that ``range(1, 5)`` was not intended.
    for list_position in (1, 5):
        for entry in response.xpath(list_template.format(list_position)):
            item = eduItem()
            item['mid'] = mid
            item['name'] = entry.xpath('./div/a/h3/strong/text()').extract()
            print(item['name'])
            hrefs = entry.xpath('./div/a/@href').extract()
            item['url'] = site_root + hrefs[0]
            print(item['url'])
            yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 25.

    ``name`` is the first anchor text in the first ``<span>``; ``url`` is
    that anchor's ``href`` minus its first character, appended to
    ``http://ordosbwg.org.cn/shjy_121919/sjhd``; ``details`` is the first
    text node of the second ``<span>``. Name and url are echoed to stdout.

    Raises:
        IndexError: if an entry lacks any of the three values.
    """
    mid = 25
    url_prefix = "http://ordosbwg.org.cn/shjy_121919/sjhd"
    for entry in response.xpath('/html/body/div[2]/div[2]/div[2]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        anchor_texts = entry.xpath('./span[1]/a/text()').extract()
        item['name'] = anchor_texts[0]
        print(item['name'])
        hrefs = entry.xpath('./span[1]/a/@href').extract()
        # The first character of the href is discarded before joining.
        item['url'] = url_prefix + hrefs[0][1:]
        print(item['url'])
        second_span_texts = entry.xpath('./span[2]/text()').extract()
        item['details'] = second_span_texts[0]
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 59.

    ``name`` is the first ``<h1>`` text with all tab and newline characters
    removed; ``url`` is the first ``href`` under ``./p/a``, used as-is.
    Both values are echoed to stdout.

    Raises:
        IndexError: if an entry lacks the heading text or the ``href``.
    """
    mid = 59
    for entry in response.xpath('/html/body/div[4]/div/div[2]/div[2]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        raw_title = entry.xpath('./h1/text()').extract()[0]
        without_tabs = raw_title.replace('\t', '')
        item['name'] = without_tabs.replace('\n', '')
        print(item['name'])
        hrefs = entry.xpath('./p/a/@href').extract()
        item['url'] = hrefs[0]
        print(item['url'])
        yield item
def parse(self, response):
    """Yield one ``eduItem`` for each even-numbered table row from 2 to 26.

    Items are tagged with mid 37. ``name`` is the first ``<font>`` text
    inside the row's anchor; ``url`` is the anchor's ``href`` appended to
    ``http://www.dqsbwg.com/``. Both values are echoed to stdout.

    Raises:
        IndexError: if a matched row lacks the font text or the ``href``.
    """
    mid = 37
    base_url = "http://www.dqsbwg.com/"
    row_template = '/html/body/table[3]/tr/td/table/tr/td[2]/table/tr[2]/td/div/table/tr[{}]'
    for row_number in range(2, 28, 2):
        for row in response.xpath(row_template.format(row_number)):
            item = eduItem()
            item['mid'] = mid
            font_texts = row.xpath('./td/a/font/text()').extract()
            item['name'] = font_texts[0]
            print(item['name'])
            hrefs = row.xpath('./td/a/@href').extract()
            item['url'] = base_url + hrefs[0]
            print(item['url'])
            yield item
def parse(self, response):
    """Yield one ``eduItem`` per listing ``<li>``, tagged with mid 38.

    ``name`` is the first ``<dt>`` text; ``url`` is the first ``href``
    under ``./div[1]/div[2]/a`` appended to
    ``https://www.shanghaimuseum.net/education/``; ``details`` is the
    stripped first ``<p>`` text of the first ``<dd>``. Name and url are
    echoed to stdout.

    Raises:
        IndexError: if an entry lacks any of the three values.
    """
    mid = 38
    base_url = "https://www.shanghaimuseum.net/education/"
    for entry in response.xpath('/html/body/div[6]/div[2]/div[1]/ul/li'):
        item = eduItem()
        item['mid'] = mid
        term_texts = entry.xpath('./div[2]/div[1]/dl/dt/text()').extract()
        item['name'] = term_texts[0]
        print(item['name'])
        hrefs = entry.xpath('./div[1]/div[2]/a/@href').extract()
        item['url'] = base_url + hrefs[0]
        print(item['url'])
        summary_texts = entry.xpath('./div[2]/div[1]/dl/dd[1]/p/text()').extract()
        item['details'] = summary_texts[0].strip()
        yield item
def parse(self, response):
    """Yield one ``eduItem`` per ``<li class="content column-num1">``, tagged with mid 29.

    ``name`` is the ``title`` attribute of the heading anchor in the first
    inner ``<li>``; ``url`` is that anchor's ``href`` appended to
    ``http://www.sypm.org.cn/``; ``details`` is the stripped first text
    node of the second inner ``<li>``. Name and url are echoed to stdout.

    Raises:
        IndexError: if an entry lacks any of the three values.
    """
    mid = 29
    base_url = "http://www.sypm.org.cn/"
    entries = response.xpath(
        '/html/body/div[1]/div[4]/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div[1]/ul/li[@class="content column-num1"]'
    )
    for entry in entries:
        item = eduItem()
        item['mid'] = mid
        titles = entry.xpath('./div/ul/li[1]/h3/a/@title').extract()
        item['name'] = titles[0]
        print(item['name'])
        hrefs = entry.xpath('./div/ul/li[1]/h3/a/@href').extract()
        item['url'] = base_url + hrefs[0]
        print(item['url'])
        summary_texts = entry.xpath('./div/ul/li[2]/text()').extract()
        item['details'] = summary_texts[0].strip()
        yield item