Exemple #1
0
def crawl_lagou_job_data_task(lagou_company_id):
    """Task that crawls Lagou job data for one company and stores it.

    For every page of the company's technology jobs, each job is cleaned,
    converted, linked to its city and company, and then either updated
    (when a row with the same ``lagou_job_id`` already exists) or inserted.
    Keywords are linked only for newly inserted jobs.

    Args:
        lagou_company_id: Lagou-side id of the company; used for the Redis
            "already crawled" key, the crawler calls and the company lookup.

    Returns:
        None. Returns early when the Redis ``setnx`` call reports that the
        key could not be set.
    """
    # Filter out companies whose jobs were already crawled in this round.
    crawled_key = constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(
        lagou_company_id=lagou_company_id)
    if not redis_instance.setnx(crawled_key, 1):
        return

    # The company row depends only on ``lagou_company_id``, so look it up once
    # instead of once per crawled job.
    company = CompanyModel.get_one(filter_by={'lagou_company_id': lagou_company_id})
    company_id = company.id

    jobs_pagination = crawlers.get_jobs_pagination_from_lagou(lagou_company_id=lagou_company_id,
                                                              job_type=constants.LagouJobType.technology)
    for page_no in jobs_pagination.iter_pages:
        job_dicts = crawlers.get_jobs_from_lagou(lagou_company_id=lagou_company_id,
                                                 job_type=constants.LagouJobType.technology,
                                                 page_no=page_no)
        # An empty page ends the crawl for this company.
        if not job_dicts:
            break
        for job_dict in job_dicts:
            crawlers.clean_lagou_job_data(job_dict)
            utils.convert.convert_dict_field_to_constants(job_dict)

            # These two fields are removed from ``job_dict`` before it is
            # passed to ``JobModel``; they are handled separately below.
            keywords = job_dict.pop('keywords')
            city_name = job_dict.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
            job_dict['company_id'] = company_id

            # NOTE(review): ``job_dict`` is used as a mapping everywhere else
            # but read by attribute here; presumably it is a dict subclass
            # with attribute access -- confirm against the crawler's return type.
            job = JobModel.get_one(filter_by={'lagou_job_id': job_dict.lagou_job_id})
            if job:
                JobModel.update_by_pk(pk=job.id, values=job_dict)
            else:
                job_id = JobModel.add(**job_dict)

                for keyword in keywords:
                    keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                    keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                    JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
Exemple #2
0
def crawl_lagou_job_data_suites(lagou_company_id):
    """Crawl a company's technology jobs from Lagou and store the new ones.

    Pages are walked until one comes back empty. Jobs flagged ``is_exist``
    are skipped; every other job is cleaned, converted, and saved together
    with its extra text fields and its keyword links.

    Args:
        lagou_company_id: Lagou-side id of the company whose jobs are crawled.
    """
    pagination = lagou_jobs_scripts.crawl_lagou_jobs_pagination(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)

    for page_no in pagination.iter_pages:
        page_jobs = lagou_jobs_scripts.crawl_lagou_jobs(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        if not page_jobs:
            # An empty page ends the crawl.
            break

        for job_data in page_jobs:
            if job_data.is_exist:
                continue

            lagou_jobs_scripts.clean_lagou_job_data(job_data)
            lagou_jobs_scripts.convert_lagou_job_data(job_data)

            owner = CompanyModel.get_one(
                filter_by={'lagou_company_id': lagou_company_id})
            job_data['company_id'] = owner.id

            # Take out the fields that are not passed to ``JobModel.add``.
            job_keywords = job_data.pop('keywords')
            job_advantage = job_data.pop('advantage')
            job_description = job_data.pop('description')
            job_data.pop('city')

            new_job_id = JobModel.add(**job_data)
            JobExtraModel.add(advantage=job_advantage,
                              description=job_description,
                              job_id=new_job_id)

            for name in job_keywords:
                keyword_ctl.insert_keyword_if_not_exist(name=name)
                JobKeywordModel.add(
                    keyword_id=keyword_ctl.get_keyword_id_by_name(name=name),
                    job_id=new_job_id)
Exemple #3
0
    def test_add(self):
        """``JobModel.add`` returns a positive id and stores the given fields."""
        to_add_data_dict = dict(title=u'后端吃饭工程师',
                                work_year=2,
                                city_id=1,
                                company_id=1,
                                department='飞天面条神教招聘',
                                salary='20k-30k',
                                education=2,
                                description=u'日常工作:吃饭!',
                                advantage='饭管饱, 管够',
                                nature=0)
        job_id = JobModel.add(**to_add_data_dict)

        self.assertGreater(job_id, 0)
        job = JobModel.get_by_pk(pk=job_id)
        # ``assertDictContainsSubset`` is deprecated since Python 3.2 and
        # removed in 3.12. Compare the expected fields against the matching
        # slice of the stored row instead; a missing key leaves the slice
        # smaller and makes the equality check fail.
        stored = job.dict()
        stored_subset = {key: stored[key] for key in to_add_data_dict if key in stored}
        self.assertEqual(to_add_data_dict, stored_subset)
        self.assertGreater(job.created_at, 0)
        self.assertGreater(job.updated_at, 0)
Exemple #4
0
 def test_add(self):
     """``JobModel.add`` returns a positive id and stores the given fields."""
     to_add_data_dict = dict(
         lg_job_id=10004,
         city_id=3,
         company_id=1,
         title='Python 开发工程师',
         work_year=5,
         department='吖吖项目组',
         salary='15k-35k',
         education=2,
         nature=1,
         description='职位介绍D',
         advantage='16薪,工作居住证,六十八险一金,双休',
     )
     job_id = JobModel.add(**to_add_data_dict)
     self.assertGreater(job_id, 0)
     job = JobModel.get_by_pk(pk=job_id)
     # ``assertDictContainsSubset`` is deprecated since Python 3.2 and
     # removed in 3.12. Compare the expected fields against the matching
     # slice of the stored row instead; a missing key leaves the slice
     # smaller and makes the equality check fail.
     stored = job.dict()
     stored_subset = {key: stored[key] for key in to_add_data_dict if key in stored}
     self.assertEqual(to_add_data_dict, stored_subset)