def test_EasyJobCrawlerHandler():
    """
    Run an EasyJobCrawlerHandler end to end on a single Boss search URL.

    The handler is configured with the Boss search spider and the Boss job
    spider (referenced by dotted path) plus a writer config, and is then
    executed on a job seeded with one message carrying the entry URL.
    """
    # Dotted paths of the spiders handed to the handler, in order.
    spiderPaths = [
        "DioSpider.OldSpider.Boss.BossSearchSpider.BossSearchSpider",
        "DioSpider.OldSpider.Boss.BossJobSpider.BossJobSpider",
    ]
    # Writer section of the handler config.
    writerConfig = {
        "id": 2,
        "params": {
            "db_name": "db",
            "collection_name": "boss",
        },
    }
    handlerConfig = {
        "id": "3",
        "params": {
            "spiders": spiderPaths,
            "writer": writerConfig,
        },
    }
    jobProcessor = EasyJobCrawlerHandler(config=handlerConfig)

    # Seed message: the search-result page the crawl starts from.
    seedInfo = {
        MSG_FIELD.ENTER_URL: "https://www.zhipin.com/c101280100/?query=%E7%88%AC%E8%99%AB&period=3&ka=sel-scale-3"
    }
    seedMsg = Message(info=seedInfo)
    seedJob = Job(initMsgs=[seedMsg])

    jobProcessor.execute(seedJob)
def run():
    """
    Crawl Boss search results and store the job details in MongoDB.

    The search spider is run once against ``ENTER_URL``; for every message
    it yields, the job spider crawls that message's enter URL and the
    collected results are handed to the MongoDB writer. A failure on one
    URL is logged, its traceback is printed, and the loop moves on to the
    next URL.
    """
    logging.info("准备跑数")
    writerConfig = {
        "id": -1,
        "params": {
            "db_name": "test",
            "collection_name": "boss",
        },
    }
    mongoWriter = MessageMongodbWriter(config=writerConfig)
    crawlJob = Job(id="boss_crawl_dali")

    logging.info("搜索爬虫跑数")
    searchMsgs = BossSearchSpider().crawl(ENTER_URL, {})
    logging.info("搜索爬虫跑数 结束")

    for searchMsg in searchMsgs:
        url = searchMsg.getEnterUrl()
        try:
            logging.info("处理url {}".format(url))
            # Materialize the crawl output so it can be counted before writing.
            jobMsgs = [item for item in BossJobSpider().crawl(url, {})]
            logging.info("写入数据 {}条".format(len(jobMsgs)))
            mongoWriter.run(crawlJob, jobMsgs)
            logging.info("写入成功")
        except Exception:
            # Best effort: report the failing URL and keep going.
            logging.error("{} 跑数失败".format(url))
            traceback.print_exc()

    logging.info("跑数结束")
def test_write():
    """
    Write one content-type message through a MongodbWriter.

    Builds a job and a sample message, creates a writer configured with a
    database name and a collection name, and calls ``write`` once.
    """
    sampleJob = Job(id="xxxxxx")

    personInfo = {"name": "mryang", "age": "18"}
    sampleMsg = Message(info=personInfo, type=SeedType.content)

    writerParams = {
        "db_name": "dio",
        "collection_name": "person",
    }
    mongoWriter = MongodbWriter(params=writerParams)
    mongoWriter.write(sampleJob, sampleMsg)
def run(self, job: Job, **kwargs) -> Job:
    """
    Wait until a job is stored for this runner, then build and return it.

    Repeatedly reads the entry keyed by the runner id from the hash named
    ``self.runnerJobMatchName``. While no entry is present, sleeps for
    ``self.waitingTime`` and tries again.

    :param job: incoming job; its content is not read, the returned job is
        built entirely from the stored JSON string
    :param kwargs: must contain ``context``, a mapping that provides
        ``runner_id``
    :return: the job produced by ``Job.form`` from the fetched JSON string
    """
    context = kwargs.get("context")
    runnerId = context.get("runner_id")

    while True:
        # Fetch the JSON stored for this runner.
        # NOTE(review): presumably ``Hash`` wraps a Redis hash and ``hget``
        # returns None when the field is missing; confirm against Hash.
        runnerJobMatch = Hash(Connection.REDIS_DEFAULT, self.runnerJobMatchName)
        jobJsonStr = runnerJobMatch.hget(runnerId)

        # Gate on the fetched payload, not on the ``job`` argument: checking
        # ``job`` either parsed a missing payload straight away (job given)
        # or spun forever even after a payload showed up (job is None).
        if jobJsonStr is not None:
            job = Job.form(jobJsonStr)
            self.logger.info("{} get job {}".format(runnerId, jobJsonStr))
            return job

        # Nothing stored yet: pause before polling again.
        TimeUtil.sleep(self.waitingTime)
def createTestJob():
    """Return a new ``Job`` whose id is ``"dio_test"``."""
    testJob = Job(id="dio_test")
    return testJob
def toPython(item):
    """Build a Job object from ``item`` via ``Job.form``."""
    job = Job.form(item)
    return job
# @Time    : 18-11-28 10:39 PM
# @Author  : DioMryang
# @File    : Example.py
# @Description : Sample Message / Job objects (testMsg, testJob, testMsgs).
import uuid

from DioFramework import Const
from DioFramework.Base.Job.Job import Job
from DioFramework.Base.Message import Message

# A content-type message filled with a placeholder URL and body text.
testMsg = Message(type=Message.CONTENT)
testMsg.updateInfo(
    {
        Const.MSG_FIELD.ENTER_URL: "http://dio.com",
        Const.MSG_FIELD.CONTENT: "la la la test Done",
    }
)

# A job with a fixed id.
testJob = Job(id="miao_miao_test")

# Single-element message list wrapping testMsg.
testMsgs = [testMsg]