#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import SubjectsItem
from education.libs import mongolib
import sys, pdb

subject_coll = mongolib.get_coll("subject")


class MyShareSpider(Spider):
    name = "subjects"
    # No allowed_domains: Scrapy has no "*" wildcard, so omitting the
    # attribute is how you allow all domains.
    start_urls = ["http://127.0.0.1/subject.htm"]

    def parse(self, response):
        sel = Selector(response)
        tables = sel.xpath("//table")
        items = []
        for table in tables:
            trs = table.xpath("tr")[1:]  # skip the header row
            for tr in trs:
                item = SubjectsItem()
                item["category"] = ""
                item["first_level"] = {}
                item["second_level"] = []
                tds = tr.xpath("td")
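                # --- Assumed completion: the original file is truncated at the
                # line above. A minimal sketch of how each row might be mapped
                # and stored; the cell-to-field mapping and the pymongo
                # insert() call are assumptions, not recovered code.
                texts = tds.xpath("text()").extract()
                if texts:
                    item["category"] = texts[0].strip()
                items.append(item)
                subject_coll.insert(dict(item))
        return items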
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import UniversityItem
from education.libs import mongolib
import sys, pdb

university_coll = mongolib.get_coll("university")


# Crawl the list of Chinese higher-education institutions
class UniversitySpider(Spider):
    name = "university"
    # No allowed_domains: Scrapy has no "*" wildcard, so omitting the
    # attribute is how you allow all domains.
    # List of regular higher-education institutions (as of 2013-06-21)
    start_urls = ["http://www.chinadegrees.cn/xwyyjsjyxx/xwsytjxx/qgptgxmd/qgptgxmd.html"]

    def parse(self, response):
        sel = Selector(response)
        tbody = sel.xpath("//table[1]/tbody")
        trs = tbody.xpath("tr")
        items = []
        for tr in trs[3:]:  # the first three rows are table headers
            tds = tr.xpath("td")
            if len(tds) == 1:
                # a single-cell row appears to be a group heading; log it
                print tds[0].xpath("strong/text()").extract()[0]
            else:
                item = UniversityItem()
                item["name"] = tds[1].xpath("text()").extract()[0]
                item["department"] = tds[2].xpath("text()").extract()[0]
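                # --- Assumed completion: the original file is truncated at the
                # line above. Collecting the item and writing it to Mongo is a
                # sketch; the pymongo insert() call is an assumption, not
                # recovered code.
                items.append(item)
                university_coll.insert(dict(item))
        return items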
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import BookItem
from education.libs import mongolib
import sys, pdb
import urllib
import urllib2
import cookielib
import bs4

book_coll = mongolib.get_coll("book")


# Crawl textbooks
class BookSpider(Spider):  # renamed from UniversitySpider, a copy-paste slip
    name = "book"
    allowed_domains = ["edu.cn"]
    start_urls = ["http://www.tbook.edu.cn/ListExamBook2.action"]

    def parse(self, response):
        # Pages through the listing by POSTing pageNo with urllib2 rather
        # than issuing Scrapy Requests, so these fetches bypass the Scrapy
        # scheduler and its throttling.
        inputData = {}
        desc_urls = []
        for i in range(0, 106):
            inputData = {"pageNo": i}
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
            postdata = urllib.urlencode(inputData)
            result2 = opener.open("http://www.tbook.edu.cn/ListExamBook2.action", postdata)
            soup = bs4.BeautifulSoup(result2, "html.parser")
            trs = soup.find_all("tr")[2:]  # skip the two header rows
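            # --- Assumed completion: the original file is truncated at the
            # line above. One BookItem per table row is a sketch; the "title"
            # field name and the cell layout (detail link in the first cell,
            # title in the second) are assumptions, not recovered code.
            for tr in trs:
                tds = tr.find_all("td")
                if len(tds) < 2:
                    continue
                item = BookItem()
                item["title"] = tds[1].get_text(strip=True)
                a = tds[0].find("a")
                if a is not None and a.get("href"):
                    desc_urls.append(a["href"])
                book_coll.insert(dict(item))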