def generate_company_data(company, city_id):
    """Store one company and its industry links.

    Cleans the listing fields with ``filter_http_tag``, maps the financing
    stage through ``FINANCE_STAGE_DICT`` (falling back to its ``'unknown'``
    entry and logging an error when the stage is not a key), fetches the
    detail fields with ``requests_company_detail_data``, saves the company
    through ``CompanyController.add`` and then adds one
    ``CompanyIndustryController`` row per distinct industry name.

    Args:
        company: Mapping describing one company. Reads ``companyId``,
            ``companyShortName``, ``companyFullName``, ``financeStage``,
            ``companyFeatures``, ``industryField`` and, when present,
            ``processRate``.
        city_id: City identifier stored on the company and on every
            company/industry link.

    Raises:
        KeyError: If one of the required keys is missing from ``company``.
    """
    company_id = company['companyId']
    shortname = filter_http_tag(company['companyShortName'])
    fullname = filter_http_tag(company['companyFullName'])

    # Stage names are looked up upper-cased; anything that is not a key is
    # reported and stored as the 'unknown' stage.
    stage_key = filter_http_tag(company['financeStage']).upper()
    if stage_key in FINANCE_STAGE_DICT:
        finance_stage = FINANCE_STAGE_DICT[stage_key]
    else:
        logger.error(company['financeStage'] + ' not in FINANCE_STAGE_DICT')
        finance_stage = FINANCE_STAGE_DICT['unknown']

    # -1 marks a company whose listing carries no processRate.
    process_rate = company.get('processRate', -1)
    features = filter_http_tag(company['companyFeatures'])
    advantage, address, size, introduce = requests_company_detail_data(
        company_id=company_id)

    CompanyController.add(
        id=company_id,
        shortname=shortname,
        fullname=fullname,
        finance_stage=finance_stage,
        process_rate=process_rate,
        city_id=city_id,
        features=features,
        advantage=advantage,
        address=address,
        size=size,
        introduce=introduce,
    )

    # Split on the ASCII comma, the full-width comma and the ideographic
    # comma; the set drops duplicate names.
    # NOTE(review): the pattern previously listed the ASCII comma twice; the
    # full-width comma is presumably what was intended - confirm against real
    # industryField values.
    industry_fields = set(re.split(",|,|、", company['industryField']))
    for industry_field in industry_fields:
        industry_id = IndustryController.get_industry_id_by_name(
            industry_field)
        CompanyIndustryController.add(company_id, industry_id, city_id)
def update_company_data(city_id, finance_stage_id, industry_id):
    """Crawl all company listing pages for one city / stage / industry.

    Requests page 1 to read ``totalCount`` and ``pageSize``, then walks the
    pages in order. Each company whose id is not yet counted by
    ``CompanyController`` is stored through ``generate_company_data``. Each
    company id not yet in the Redis set
    ``constants.REDIS_VISITED_COMPANY_KEY`` is added to it and passed to
    ``update_job_data``. Crawling stops early at the first page whose
    ``result`` is empty.

    Args:
        city_id: City identifier, used in the URL and stored on new companies.
        finance_stage_id: Financing-stage identifier used in the URL.
        industry_id: Industry identifier used in the URL.

    Raises:
        ZeroDivisionError: If the response reports a ``pageSize`` of 0.
    """
    logger.info('正在爬取城市={}, 融资类型={}, 行业类别={}'.format(
        city_id, finance_stage_id, industry_id))
    # Build the listing URL for this combination.
    url = constants.CITY_COMPANY_URL.format(city=city_id,
                                            finance_stage=finance_stage_id,
                                            industry=industry_id)
    response = request_company_json(url, page_no=1)

    # Number of pages to crawl. Integer ceiling division avoids any
    # dependence on how "/" treats two ints.
    total_count = int(response['totalCount'])
    page_size = int(response['pageSize'])
    page_count = -(-total_count // page_size)

    for page_no in range(1, page_count + 1):
        logger.info('正在爬取城市={}, 融资类型={}, 行业类别={}, 第 「{}」 页'.format(
            city_id, finance_stage_id, industry_id, page_no))
        # Page 1 was already fetched above to read the paging metadata, so
        # its response is reused instead of being requested a second time.
        if page_no > 1:
            response = request_company_json(url=url, page_no=page_no)
        companys = response['result']
        if not companys:
            break
        for company in companys:
            company_id = int(company['companyId'])
            if CompanyController.count(id=company_id) == 0:
                generate_company_data(company=company, city_id=city_id)
            # Refresh the jobs of each company only once per visited-set
            # entry.
            if not redis_instance.sismember(
                    constants.REDIS_VISITED_COMPANY_KEY, company_id):
                redis_instance.sadd(constants.REDIS_VISITED_COMPANY_KEY,
                                    company_id)
                update_job_data(company_id=company_id)

    logger.info('爬取城市={}, 融资类型={}, 行业类别={}, 任务结束'.format(
        city_id, finance_stage_id, industry_id))
def finance_stage_distribution_analyze(cls, jobs):
    """Tally the financing stages of the companies behind the given jobs.

    Args:
        jobs: Iterable of job objects exposing ``company_id``.

    Returns:
        dict: One counter per stage label listed below, each incremented once
        for every company returned by ``CompanyController.list`` whose stage
        maps to that label. Stages mapping to any other label are ignored.

    Raises:
        KeyError: If a company's ``finance_stage`` is not a key of the
            reversed ``constants.FINANCE_STAGE_DICT``.
    """
    ids = [job.company_id for job in jobs]
    company_rows = CompanyController.list(ids=ids)
    # presumably maps the stored stage value back to its label - confirm
    # against reverse_dict.
    label_by_stage = reverse_dict(constants.FINANCE_STAGE_DICT)

    # Financing-stage counters, listed in a specific order.
    distribution = dict.fromkeys(
        (
            '成熟型(不需要融资)',
            '成长型(不需要融资)',
            '初创型(不需要融资)',
            '上市公司',
            '成熟型(D轮及以上)',
            '成熟型(C轮)',
            '成长型(B轮)',
            '成长型(A轮)',
            '初创型(天使轮)',
            '初创型(未融资)',
        ),
        0,
    )

    for row in company_rows:
        label = label_by_stage[row.finance_stage]
        if label not in distribution:
            continue
        distribution[label] += 1
    return distribution
def resolve_all_companies(*_):
    """Return the result of ``CompanyController.get_all_companies()``.

    Any positional arguments are accepted and ignored.
    """
    return CompanyController.get_all_companies()