Example 1
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import SubjectsItem
from education.libs import mongolib

import sys,pdb

# MongoDB collection handle for scraped subjects.
# NOTE(review): mongolib is a project-local helper — presumably wraps pymongo; confirm.
subject_coll = mongolib.get_coll("subject")

class MyShareSpider(Spider):
    """Spider "subjects": reads subject tables from a local test page.

    Builds one SubjectsItem per table row with empty category/level fields.
    NOTE(review): parse() is truncated in this source — the code that fills
    the item from the <td> cells (and any yield/return) is missing here.
    """
    name = "subjects"
    # NOTE(review): Scrapy matches allowed_domains literally; "*" is not a
    # wildcard and effectively restricts nothing useful — confirm intent.
    allowed_domains = ["*"]
    start_urls  = ["http://127.0.0.1/subject.htm"]

    def parse(self, response):
        # Collect every <table> element on the page.
        sel = Selector(response)
        tables = sel.xpath("//table")

        items = []
        for table in tables:
            # [1:] skips the first row of each table (presumably a header row).
            trs = table.xpath("tr")[1:]
            for tr in trs:

                # One item per data row; fields start out empty.
                item = SubjectsItem()
                item["category"] = ""
                item["first_level"] = {}
                item["second_level"] = []

                # Cells of the current row; their consumption is cut off below.
                tds = tr.xpath("td")
Example 2
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import UniversityItem
from education.libs import mongolib

import sys,pdb

# MongoDB collection handle for scraped universities.
# NOTE(review): mongolib is a project-local helper — presumably wraps pymongo; confirm.
university_coll = mongolib.get_coll("university")

# Scrape the list of Chinese universities.
class UniversitySpider(Spider):
    """Spider "university": parses the national list of regular universities.

    Rows with a single cell are section headings (printed to stdout,
    Python 2 print statement); multi-cell rows yield a UniversityItem with
    name and supervising department.
    NOTE(review): parse() is truncated in this source — the remaining item
    fields and the yield/return are missing here.
    """
    name = "university"
    # NOTE(review): "*" is not a wildcard for Scrapy's allowed_domains — confirm intent.
    allowed_domains = ["*"]
    start_urls  = ["http://www.chinadegrees.cn/xwyyjsjyxx/xwsytjxx/qgptgxmd/qgptgxmd.html"] # List of regular institutions of higher education (as of 2013-06-21)

    def parse(self, response):
        # The data lives in the first table's tbody.
        sel = Selector(response)
        tbody = sel.xpath("//table[1]/tbody")
        trs = tbody.xpath("tr")

        items = []
        # [3:] skips the first three rows (presumably title/header rows — confirm).
        for tr in trs[3:]:
            tds = tr.xpath("td")
            if len(tds) == 1:
                # Single-cell row: a bold section heading (e.g. a province name).
                print tds[0].xpath("strong/text()").extract()[0]
            else:
                # Data row: column 1 is the university name, column 2 its
                # supervising department (column 0 unused here).
                item = UniversityItem()
                item["name"] = tds[1].xpath("text()").extract()[0]
                item["department"] = tds[2].xpath("text()").extract()[0]
Example 3
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from education.items import BookItem
from education.libs import mongolib

import sys,pdb
import urllib
import urllib2
import cookielib
import bs4

# MongoDB collection handle for scraped textbooks.
# NOTE(review): mongolib is a project-local helper — presumably wraps pymongo; confirm.
book_coll = mongolib.get_coll("book")

# Scrape textbooks.
# NOTE(review): class name duplicates the UniversitySpider above; Scrapy keys
# spiders on `name` ("book"), so it works, but renaming would avoid confusion.
class UniversitySpider(Spider):
    """Spider "book": pages through the exam-book listing via POST requests.

    Bypasses the Scrapy response entirely: uses urllib2 with a cookie-aware
    opener to POST pageNo=0..105 and parses each result with BeautifulSoup.
    NOTE(review): parse() is truncated in this source — the per-row handling
    after find_all("tr") is missing here.
    """
    name = "book"
    allowed_domains = ["edu.cn"]
    start_urls  = ["http://www.tbook.edu.cn/ListExamBook2.action"]

    def parse(self, response):
        inputData = {}
        desc_urls = []
        # One POST per result page; 106 pages total (indices 0-105).
        for i in range(0,106):
            inputData = {"pageNo": i}
            # Fresh cookie-handling opener per request (Python 2 urllib2).
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
            postdata = urllib.urlencode(inputData)
            # Passing data makes this a POST to the listing endpoint.
            result2 = opener.open("http://www.tbook.edu.cn/ListExamBook2.action", postdata)
            soup = bs4.BeautifulSoup(result2, "html.parser")
            # [2:] skips the first two rows (presumably header rows — confirm).
            trs = soup.find_all("tr")[2:]