def crawl_lagou_job_data_task(lagou_company_id):
    """Task that crawls Lagou job data for one company.

    Iterates over the pages reported by the jobs pagination for the
    technology job type. Each crawled job dict is cleaned and converted, gets
    its ``city_id`` and ``company_id`` filled in, and is then either written
    over the existing ``JobModel`` row with the same Lagou job id or added as
    a new row. Newly added jobs are also linked to their keywords.

    NOTE(review): the original formatting was lost, so the nesting is
    reconstructed -- keyword linking is placed on the "add" path because that
    is the only path that binds ``job_id``. Confirm against history.

    :param lagou_company_id: Lagou-side id of the company whose jobs are crawled.
    """
    # Skip companies whose jobs have already been crawled in this round: the
    # task returns early when setnx reports a falsy result for the marker key.
    crawled_marker_key = constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(
        lagou_company_id=lagou_company_id)
    if not redis_instance.setnx(crawled_marker_key, 1):
        return

    pagination = crawlers.get_jobs_pagination_from_lagou(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)

    for page_no in pagination.iter_pages:
        page_jobs = crawlers.get_jobs_from_lagou(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        # An empty page ends the crawl for this company.
        if not page_jobs:
            break

        for job_dict in page_jobs:
            crawlers.clean_lagou_job_data(job_dict)
            utils.convert.convert_dict_field_to_constants(job_dict)

            # 'keywords' and 'city_name' are removed before the dict is
            # passed on to JobModel.
            keywords = job_dict.pop('keywords')
            city_name = job_dict.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)

            # NOTE(review): this lookup is repeated with the same filter for
            # every job of the company.
            company = CompanyModel.get_one(
                filter_by={'lagou_company_id': lagou_company_id})
            job_dict['company_id'] = company.id

            # NOTE(review): attribute access on job_dict -- presumably the
            # crawler returns a dict type that supports it; verify.
            existing_job = JobModel.get_one(
                filter_by={'lagou_job_id': job_dict.lagou_job_id})
            if existing_job:
                JobModel.update_by_pk(pk=existing_job.id, values=job_dict)
                continue

            new_job_id = JobModel.add(**job_dict)
            for keyword in keywords:
                keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                JobKeywordModel.add(keyword_id=keyword_id, job_id=new_job_id)
def crawl_lagou_job_data_suites(lagou_company_id):
    """Crawl and store the technology jobs of one Lagou company.

    Iterates over the pages reported by the jobs pagination. Every crawled
    job whose ``is_exist`` is falsy is cleaned, converted, added to
    ``JobModel``, given a ``JobExtraModel`` row holding its advantage and
    description, and linked to its keywords.

    NOTE(review): the original formatting was lost, so the nesting is
    reconstructed -- everything after the ``is_exist`` check is treated as
    conditional on it. Confirm against history.

    :param lagou_company_id: Lagou-side id of the company whose jobs are crawled.
    """
    pagination = lagou_jobs_scripts.crawl_lagou_jobs_pagination(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)

    for page_no in pagination.iter_pages:
        page_jobs = lagou_jobs_scripts.crawl_lagou_jobs(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        # An empty page ends the crawl for this company.
        if not page_jobs:
            break

        for job_dict in page_jobs:
            # Jobs flagged as existing are left untouched.
            if job_dict.is_exist:
                continue

            lagou_jobs_scripts.clean_lagou_job_data(job_dict)
            lagou_jobs_scripts.convert_lagou_job_data(job_dict)

            # NOTE(review): this lookup is repeated with the same filter for
            # every new job of the company.
            company = CompanyModel.get_one(
                filter_by={'lagou_company_id': lagou_company_id})
            job_dict['company_id'] = company.id

            # These entries are removed before the dict is passed to
            # JobModel.add; 'city' is discarded, the others are stored through
            # JobExtraModel / JobKeywordModel below.
            keywords = job_dict.pop('keywords')
            advantage = job_dict.pop('advantage')
            description = job_dict.pop('description')
            job_dict.pop('city')

            new_job_id = JobModel.add(**job_dict)
            JobExtraModel.add(advantage=advantage,
                              description=description,
                              job_id=new_job_id)

            for keyword in keywords:
                keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                JobKeywordModel.add(keyword_id=keyword_id, job_id=new_job_id)
def test_add(self):
    """JobModel.add returns a positive id and stores every supplied field.

    Also checks that the stored row carries positive ``created_at`` and
    ``updated_at`` values.
    """
    to_add_data_dict = dict(
        title=u'后端吃饭工程师',
        work_year=2,
        city_id=1,
        company_id=1,
        department='飞天面条神教招聘',
        salary='20k-30k',
        education=2,
        description=u'日常工作:吃饭!',
        advantage='饭管饱, 管够',
        nature=0,
    )

    job_id = JobModel.add(**to_add_data_dict)
    self.assertGreater(job_id, 0)

    job = JobModel.get_by_pk(pk=job_id)
    stored = job.dict()
    # Replacement for assertDictContainsSubset (deprecated, removed in
    # Python 3.12): overlaying the expected pairs on the stored dict leaves
    # it unchanged only when every expected pair is already present.
    self.assertEqual(dict(stored, **to_add_data_dict), stored)
    self.assertGreater(job.created_at, 0)
    self.assertGreater(job.updated_at, 0)
def test_add(self):
    """JobModel.add returns a positive id and stores every supplied field."""
    to_add_data_dict = dict(
        lg_job_id=10004,
        city_id=3,
        company_id=1,
        title='Python 开发工程师',
        work_year=5,
        department='吖吖项目组',
        salary='15k-35k',
        education=2,
        nature=1,
        description='职位介绍D',
        advantage='16薪,工作居住证,六十八险一金,双休',
    )

    job_id = JobModel.add(**to_add_data_dict)
    self.assertGreater(job_id, 0)

    job = JobModel.get_by_pk(pk=job_id)
    stored = job.dict()
    # Replacement for assertDictContainsSubset (deprecated, removed in
    # Python 3.12): overlaying the expected pairs on the stored dict leaves
    # it unchanged only when every expected pair is already present.
    self.assertEqual(dict(stored, **to_add_data_dict), stored)