Ejemplo n.º 1
0
    def crawl(self, enterUrl, info):
        """Download a job-detail page and yield one Message describing it.

        Most fields are extracted from the raw page text with regular
        expressions (``TextUtil.getFirstMatch``); the HR name and the two
        free-text sections come from CSS selectors on the parsed document.

        :param enterUrl: URL handed to ``Downloader.getWithBs4``.
        :param info: forwarded to ``Message(info=...)``.
        :return: generator that yields a single ``Message``.

        NOTE(review): a page without any ``.detail-content .text`` element
        (``sections[0]``) or without an ``h2.name`` element makes this raise
        instead of yielding -- presumably that is how a bad download is
        surfaced to the caller; confirm before softening it.
        """
        res = Downloader.getWithBs4(enterUrl)
        page = res.text

        # Selected once; index 0 is used as the job description and index 1,
        # when present, as the company introduction.
        sections = res.soup.select(".detail-content .text")

        msg = Message(info=info)
        msg.updateInfo({
            "_id": TextUtil.getFirstMatch(page, r"job_id: '(.*?)',"),
            "job_name": TextUtil.getFirstMatch(page, r"job_name: '(.*?)',"),
            "job_salary": TextUtil.getFirstMatch(page, r"job_salary: '(.*?)',"),
            "company": TextUtil.getFirstMatch(page, r"company:'(.*?)',"),
            "city": TextUtil.getFirstMatch(page, r'<p>城市:(.*?)<em class="vline">'),
            # Raw strings keep the regex backslashes (\/ and \d) literal
            # without relying on Python passing unknown escapes through.
            "work_year": TextUtil.getFirstMatch(page, r'<\/em>经验:(.*?)<em class="vline">'),
            "pubDate": DateTimeUtil.getStandardDatetime(DateTimeUtil.guess(
                TextUtil.getFirstMatch(page, r'"pubDate": "(\d+-\d+-\d+T\d+:\d+:\d+)",'))),
            "upDate": DateTimeUtil.getCurStandardDate(),
            "hr": res.soup.select_one("h2.name").text,

            "job_description": sections[0].text.strip(),
            # The second section is optional; fall back to an empty string.
            "company_introduction": sections[1].text.strip() if len(sections) >= 2 else "",
        })

        yield msg
Ejemplo n.º 2
0
# @Time         : 18-11-28 下午10:39
# @Author       : DioMryang
# @File         : Example.py
# @Description  :
import uuid

from DioFramework import Const
from DioFramework.Base.Job.Job import Job
from DioFramework.Base.Message import Message

# Sample message of type Message.CONTENT carrying an entry URL and a short
# content string.
testMsg = Message(type=Message.CONTENT)
_testMsgInfo = {
    Const.MSG_FIELD.ENTER_URL: "http://dio.com",
    Const.MSG_FIELD.CONTENT: "la la la test Done",
}
testMsg.updateInfo(_testMsgInfo)

# Sample job identified by a fixed id.
testJob = Job(id="miao_miao_test")

# The sample message wrapped in a list.
testMsgs = [testMsg]