def test_OrMessageMatchStrategy():
    """Exercise OrMessageMatchStrategy with an ifeng article URL.

    Builds a parameter set with a partial-regex rule on ``enter_url`` and an
    equality rule on ``spider_ids``, sets ``enter_url`` on a fresh Message,
    and prints whatever ``match`` returns for it.
    """
    params = {
        "enter_url": {
            # Raw string: "\d" inside a plain literal is an invalid escape
            # sequence and raises a warning on current Python versions.
            # The runtime value of the pattern is unchanged.
            "partialRegex": r"http://ent.ifeng.com/a/20181128/\d+_\d+.shtml"
        },
        "spider_ids": {
            "equal": "12321"
        }
    }
    msg = Message()
    msg.info["enter_url"] = "http://ent.ifeng.com/a/20181128/43142134_0.shtml"
    strategy = OrMessageMatchStrategy(params=params)
    print(strategy.match(msg))
def test_EasyJobCrawlerHandler():
    """Run EasyJobCrawlerHandler on a job seeded with one Boss search URL.

    The handler is configured with two spider class paths and a writer
    config; a Job holding a single seed Message is then executed.
    """
    spiderPaths = [
        "DioSpider.OldSpider.Boss.BossSearchSpider.BossSearchSpider",
        "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider",
    ]
    writerConfig = {
        "id": 2,
        "params": {"db_name": "db", "collection_name": "boss"},
    }
    handlerConfig = {
        "id": "3",
        "params": {"spiders": spiderPaths, "writer": writerConfig},
    }
    jobProcessor = EasyJobCrawlerHandler(config=handlerConfig)

    searchUrl = "https://www.zhipin.com/c101280100/?query=%E7%88%AC%E8%99%AB&period=3&ka=sel-scale-3"
    seedMsg = Message(info={MSG_FIELD.ENTER_URL: searchUrl})
    jobProcessor.execute(Job(initMsgs=[seedMsg]))
def crawl(self, enterUrl, info):
    """Yield one Message per job link found on the page at ``enterUrl``.

    The page is fetched with HTML parsing switched on. Every anchor matched
    by the ``.info-primary h3 a`` selector produces a Message whose
    ``enter_url`` is the anchor's ``href`` prefixed with the zhipin.com
    origin. ``info`` is accepted but not used here.
    """
    downloadSetting = Setting()
    downloadSetting.htmlParse = True
    response = Downloader.get(enterUrl, downloadSetting)

    origin = "https://www.zhipin.com"
    for anchor in response.soup.select(".info-primary h3 a"):
        detailUrl = origin + anchor["href"]
        yield Message(info={"enter_url": detailUrl})
def test_run():
    """Run TemplateSpider on one Boss job-detail URL and pretty-print each result.

    Template ``-2`` is registered on ``testJob`` under the key ``"-2"``
    before the spider is executed.
    """
    loader = TemplateLoader(TemplateConfig.getById(-2))
    testJob.setTemplateLoaderMapping({"-2": loader})

    detailUrl = "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"
    testMsg = Message(info={MSG_FIELD.ENTER_URL: detailUrl})

    results = TemplateSpider().execute(testJob, [testMsg])
    for msg in results:
        TestUtil.pretty(msg)
def test_write():
    """Write one sample content-type Message through MongodbWriter."""
    job = Job(id="xxxxxx")
    personInfo = {"name": "mryang", "age": "18"}
    msg = Message(info=personInfo, type=SeedType.content)

    writerParams = {"db_name": "dio", "collection_name": "person"}
    writer = MongodbWriter(params=writerParams)
    writer.write(job, msg)
def test_TemplateLoader():
    """Print whether template ``-2`` matches a Boss job URL, then pretty-print its output."""
    loader = TemplateLoader(TemplateConfig.getById(-2))

    detailUrl = "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"
    testMsg = Message(info={MSG_FIELD.ENTER_URL: detailUrl})

    isMatch = loader.match(testMsg)
    print(isMatch)

    pretty(loader.execute(testJob, testMsg))
def crawl(self, enterUrl, info):
    """Scrape one job-detail page and yield a single enriched Message.

    Args:
        enterUrl: URL of the job-detail page to download.
        info: Initial info payload handed to the outgoing Message.

    Yields:
        One Message built from ``info`` and updated with the fields extracted
        from the downloaded page (ids, names, salary, dates, descriptions).
    """
    res = Downloader.getWithBs4(enterUrl)
    # Query the description sections once; both description fields reuse it.
    sections = res.soup.select(".detail-content .text")
    msg = Message(info=info)
    # Patterns containing backslashes are raw strings so that "\d" and "\/"
    # are not treated as (invalid) string escapes; runtime values are unchanged.
    msg.updateInfo({
        "_id": TextUtil.getFirstMatch(res.text, "job_id: '(.*?)',"),
        "job_name": TextUtil.getFirstMatch(res.text, "job_name: '(.*?)',"),
        "job_salary": TextUtil.getFirstMatch(res.text, "job_salary: '(.*?)',"),
        "company": TextUtil.getFirstMatch(res.text, "company:'(.*?)',"),
        "city": TextUtil.getFirstMatch(res.text, '<p>城市:(.*?)<em class="vline">'),
        "work_year": TextUtil.getFirstMatch(res.text, r'<\/em>经验:(.*?)<em class="vline">'),
        "pubDate": DateTimeUtil.getStandardDatetime(DateTimeUtil.guess(
            TextUtil.getFirstMatch(res.text, r'"pubDate": "(\d+-\d+-\d+T\d+:\d+:\d+)",'))),
        "upDate": DateTimeUtil.getCurStandardDate(),
        # NOTE(review): fails if the page has no "h2.name" element — confirm
        # whether callers rely on that exception.
        "hr": res.soup.select_one("h2.name").text,
        # NOTE(review): indexing [0] fails when the selector matches nothing.
        "job_description": sections[0].text.strip(),
        # The second section is optional; fall back to an empty string.
        "company_introduction": sections[1].text.strip() if len(sections) >= 2 else "",
    })
    yield msg
def test_ScriptSpider():
    """Run ScriptSpider configured with BossJobSpider on one job URL and pretty-print the results."""
    # spider configuration
    spiderConfig = {
        "class_name": "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider",
        "params": {},
    }
    cfg = {"id": 15, "params": {"spider": spiderConfig}}
    spider = ScriptSpider(config=cfg)

    seedMsg = Message(
        info={"enter_url": "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"})
    for msg in spider.run(testJob, [seedMsg]):
        pretty(msg)
def createTestMsg():
    """Build and return a sample Message with fixed ``name`` and ``age`` info."""
    sampleInfo = {"name": "mryang", "age": "18"}
    return Message(info=sampleInfo)
def setInfoSpiderName(cls, msg: Message):
    """Record ``cls.__name__`` in the message info under ``MSG_FIELD.SPIDER_NAME``."""
    spiderNameEntry = {MSG_FIELD.SPIDER_NAME: cls.__name__}
    msg.getInfo().update(spiderNameEntry)
def toPython(item):
    """Parse ``item`` into a Message and return it."""
    parsed = Message.form(item)
    return parsed
# @Time : 18-11-28 下午10:39 # @Author : DioMryang # @File : Example.py # @Description : import uuid from DioFramework import Const from DioFramework.Base.Job.Job import Job from DioFramework.Base.Message import Message testMsg = Message(type=Message.CONTENT) testMsg.updateInfo({ Const.MSG_FIELD.ENTER_URL: "http://dio.com", Const.MSG_FIELD.CONTENT: "la la la test Done" }) testJob = Job(id="miao_miao_test") testMsgs = [testMsg]
def write(self, job: Job, message: Message):
    """Log the message, then insert its insert-data document into ``self.collection``.

    ``job`` is accepted but not used here.
    """
    logLine = " mongodb writer distribute {}".format(message)
    self.logger.info(logLine)

    document = message.getInsertData()
    self.collection.insert_one(document)
def match(self, message: Message):
    """Return True when ``self.regex`` matches at the start of the message's enter URL."""
    pattern = self.regex
    enterUrl = message.getEnterUrl()
    found = re.match(pattern, enterUrl)
    return found is not None