Example #1
def test_OrMessageMatchStrategy():
    params = {
        "enter_url": {
            "partialRegex": "http://ent.ifeng.com/a/20181128/\d+_\d+.shtml"
        },
        "spider_ids": {
            "equal": "12321"
        }
    }

    msg = Message()
    msg.info["enter_url"] = "http://ent.ifeng.com/a/20181128/43142134_0.shtml"

    strategy = OrMessageMatchStrategy(params=params)
    print(strategy.match(msg))
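For reference, a rough standalone sketch of the "or" semantics these params suggest, assuming "partialRegex" is applied with re.search, "equal" is plain string equality, and a hit on any configured field counts as a match; the real OrMessageMatchStrategy may behave differently:

import re

def or_match(params, info):
    # Return True as soon as any configured field condition holds (assumed semantics).
    for field, condition in params.items():
        value = info.get(field, "")
        if "partialRegex" in condition and re.search(condition["partialRegex"], value):
            return True
        if "equal" in condition and value == condition["equal"]:
            return True
    return False

# or_match({"enter_url": {"partialRegex": r"/a/\d+/\d+_\d+.shtml"}},
#          {"enter_url": "http://ent.ifeng.com/a/20181128/43142134_0.shtml"})  # -> True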
Example #2
def test_EasyJobCrawlerHandler():
    jobProcessor = EasyJobCrawlerHandler(
        config={
            "id": "3",
            "params": {
                "spiders": [
                    "DioSpider.OldSpider.Boss.BossSearchSpider.BossSearchSpider",
                    "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider"
                ],
                "writer": {
                    "id": 2,
                    "params": {
                        "db_name": "db",
                        "collection_name": "boss"
                    }
                }
            }
        })
    msg = Message(
        info={
            MSG_FIELD.ENTER_URL:
            "https://www.zhipin.com/c101280100/?query=%E7%88%AC%E8%99%AB&period=3&ka=sel-scale-3"
        })

    job = Job(initMsgs=[msg])
    jobProcessor.execute(job)
Example #3
    def crawl(self, enterUrl, info):
        setting = Setting()
        setting.htmlParse = True
        res = Downloader.get(enterUrl, setting)

        for aTag in res.soup.select(".info-primary h3 a"):
            yield Message(
                info={"enter_url": "https://www.zhipin.com" + aTag["href"]})
Example #4
def test_run():
    testJob.setTemplateLoaderMapping(
        {"-2": TemplateLoader(TemplateConfig.getById(-2))})
    testMsg = Message(
        info={
            MSG_FIELD.ENTER_URL:
            "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"
        })
    for msg in TemplateSpider().execute(testJob, [testMsg]):
        TestUtil.pretty(msg)
Example #5
def test_write():
    job = Job(id="xxxxxx")
    msg = Message(info={"name": "mryang", "age": "18"}, type=SeedType.content)

    params = {
        "db_name": "dio",
        "collection_name": "person"
    }

    w = MongodbWriter(params=params)
    w.write(job, msg)
Example #6
def test_TemplateLoader():
    tpConfig = TemplateConfig.getById(-2)
    tp = TemplateLoader(tpConfig)
    testMsg = Message(
        info={
            MSG_FIELD.ENTER_URL:
            "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"
        })
    print(tp.match(testMsg))
    result = tp.execute(testJob, testMsg)
    pretty(result)
Example #7
    def crawl(self, enterUrl, info):
        res = Downloader.getWithBs4(enterUrl)

        text = res.soup.select(".detail-content .text")
        msg = Message(info=info)
        msg.updateInfo({
            "_id": TextUtil.getFirstMatch(res.text, "job_id: '(.*?)',"),
            "job_name": TextUtil.getFirstMatch(res.text, "job_name: '(.*?)',"),
            "job_salary": TextUtil.getFirstMatch(res.text, "job_salary: '(.*?)',"),
            "company": TextUtil.getFirstMatch(res.text, "company:'(.*?)',"),
            "city": TextUtil.getFirstMatch(res.text, "<p>城市:(.*?)<em class=\"vline\">"),
            "work_year": TextUtil.getFirstMatch(res.text, "<\/em>经验:(.*?)<em class=\"vline\">"),
            "pubDate": DateTimeUtil.getStandardDatetime(DateTimeUtil.guess(
                    TextUtil.getFirstMatch(res.text, r'"pubDate": "(\d+-\d+-\d+T\d+:\d+:\d+)",'))),
            "upDate": DateTimeUtil.getCurStandardDate(),
            "hr": res.soup.select_one("h2.name").text,

            "job_description": text[0].text.strip(),
            "company_introduction": res.soup.select(".detail-content .text")[1].text.strip() if len(text) >=2 else "",
        })

        yield msg
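The field extraction above leans on TextUtil.getFirstMatch; a plausible minimal equivalent, assuming it simply returns the first capture group of the first match (or None when the pattern misses), would be:

import re

def get_first_match(text, pattern):
    # First capture group of the first match, or None if nothing matches (assumed behavior).
    match = re.search(pattern, text)
    return match.group(1) if match is not None else None

# get_first_match("job_id: '123',", r"job_id: '(.*?)',")  # -> '123'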
Example #8
def test_ScriptSpider():
    cfg = {
        "id": 15,
        "params": {
            "spider": {  # spider 配置
                "class_name":
                "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider",
                "params": {}
            }
        }
    }

    spider = ScriptSpider(config=cfg)
    for msg in spider.run(testJob, [
            Message(
                info={
                    "enter_url":
                    "https://www.zhipin.com/job_detail/3265d372b1182c951HR50t2-ElU~.html"
                })
    ]):
        pretty(msg)
Example #9
def createTestMsg():
    return Message(info={"name": "mryang", "age": "18"})
Example #10
    def setInfoSpiderName(cls, msg: Message):
        msg.getInfo().update({MSG_FIELD.SPIDER_NAME: cls.__name__})
Example #11
    def toPython(item):
        """Parse the item into a Message and return it."""
        return Message.form(item)
Example #12
# @Time         : 18-11-28 下午10:39
# @Author       : DioMryang
# @File         : Example.py
# @Description  :
import uuid

from DioFramework import Const
from DioFramework.Base.Job.Job import Job
from DioFramework.Base.Message import Message

testMsg = Message(type=Message.CONTENT)
testMsg.updateInfo({
    Const.MSG_FIELD.ENTER_URL: "http://dio.com",
    Const.MSG_FIELD.CONTENT: "la la la test Done"
})

testJob = Job(id="miao_miao_test")

testMsgs = [testMsg]
Example #13
    def write(self, job: Job, message: Message):
        self.logger.info(" mongodb writer distribute {}".format(message))
        self.collection.insert_one(message.getInsertData())
Example #14
    def match(self, message: Message):
        return re.match(self.regex, message.getEnterUrl()) is not None