コード例 #1
0
def generate_company_data(company, city_id):
    """Store one company record together with its industry links.

    Args:
        company: Mapping describing a single company. The keys
            ``companyId``, ``companyShortName``, ``companyFullName``,
            ``financeStage``, ``companyFeatures`` and ``industryField``
            are read; ``processRate`` is optional and stored as -1 when
            it is absent.
        city_id: City identifier stored with the company and with every
            company/industry link.
    """
    company_id = company['companyId']
    shortname = filter_http_tag(company['companyShortName'])
    fullname = filter_http_tag(company['companyFullName'])

    # Map the upper-cased stage name to its id; unknown names are logged
    # and stored under the 'unknown' entry instead of aborting.
    finance_stage = filter_http_tag(company['financeStage']).upper()
    if finance_stage in FINANCE_STAGE_DICT:
        finance_stage = FINANCE_STAGE_DICT[finance_stage]
    else:
        logger.error('{} not in FINANCE_STAGE_DICT'.format(
            company['financeStage']))
        finance_stage = FINANCE_STAGE_DICT['unknown']

    process_rate = company.get('processRate', -1)
    features = filter_http_tag(company['companyFeatures'])
    advantage, address, size, introduce = requests_company_detail_data(
        company_id=company_id)

    CompanyController.add(
        id=company_id,
        shortname=shortname,
        fullname=fullname,
        finance_stage=finance_stage,
        process_rate=process_rate,
        city_id=city_id,
        features=features,
        advantage=advantage,
        address=address,
        size=size,
        introduce=introduce,
    )

    # The industry field is a delimited list; accept the ASCII comma, the
    # full-width comma and the ideographic enumeration comma as separators.
    # The set removes duplicate names so each link is added once.
    industry_fields = set(re.split(r"[,，、]", company['industryField']))
    for industry_field in industry_fields:
        industry_id = IndustryController.get_industry_id_by_name(
            industry_field)
        CompanyIndustryController.add(company_id, industry_id, city_id)
コード例 #2
0
def update_company_data(city_id, finance_stage_id, industry_id):
    """Crawl every result page for one city/finance-stage/industry filter.

    Companies that are not stored yet are created through
    ``generate_company_data``. For each company id not yet present in the
    Redis visited set, the id is added to the set and ``update_job_data``
    is called for it.

    Args:
        city_id: City filter value substituted into the listing URL.
        finance_stage_id: Finance-stage filter value for the listing URL.
        industry_id: Industry filter value for the listing URL.
    """
    logger.info('正在爬取城市={}, 融资类型={}, 行业类别={}'.format(city_id, finance_stage_id,
                                                     industry_id))
    # Build the listing URL for this filter combination.
    url = constants.CITY_COMPANY_URL.format(city=city_id,
                                            finance_stage=finance_stage_id,
                                            industry=industry_id)
    response = request_company_json(url, page_no=1)

    # Number of pages to crawl: integer ceiling division, so the result
    # does not depend on float arithmetic or on true-division semantics.
    total_count = int(response['totalCount'])
    page_size = int(response['pageSize'])
    page_count = (total_count + page_size - 1) // page_size

    for page_no in range(1, page_count + 1):
        logger.info('正在爬取城市={}, 融资类型={}, 行业类别={}, 第 「{}」 页'.format(
            city_id, finance_stage_id, industry_id, page_no))
        # Page 1 was already fetched above to read the totals; reuse that
        # response instead of requesting the same page a second time.
        if page_no > 1:
            response = request_company_json(url=url, page_no=page_no)
        companys = response['result']
        if not companys:
            break
        for company in companys:
            company_id = int(company['companyId'])
            if CompanyController.count(id=company_id) == 0:
                generate_company_data(company=company, city_id=city_id)
            # Refresh the job data of each company once per visited set.
            if not redis_instance.sismember(
                    constants.REDIS_VISITED_COMPANY_KEY, company_id):
                redis_instance.sadd(constants.REDIS_VISITED_COMPANY_KEY,
                                    company_id)
                update_job_data(company_id=company_id)
    logger.info('爬取城市={}, 融资类型={}, 行业类别={}, 任务结束'.format(
        city_id, finance_stage_id, industry_id))
コード例 #3
0
ファイル: job.py プロジェクト: whattwitter/webspider
    def finance_stage_distribution_analyze(cls, jobs):
        """Count, per finance stage, the companies behind the given jobs.

        Args:
            jobs: Iterable of job objects exposing ``company_id``.

        Returns:
            A dict mapping each listed finance-stage label to the number
            of matching companies. Labels are inserted in the fixed order
            below; stages outside this list are not counted.
        """
        companies = CompanyController.list(
            ids=[job.company_id for job in jobs])
        stage_name_by_id = reverse_dict(constants.FINANCE_STAGE_DICT)

        # Fixed presentation order of the finance-stage labels.
        ordered_stage_names = (
            '成熟型(不需要融资)',
            '成长型(不需要融资)',
            '初创型(不需要融资)',
            '上市公司',
            '成熟型(D轮及以上)',
            '成熟型(C轮)',
            '成长型(B轮)',
            '成长型(A轮)',
            '初创型(天使轮)',
            '初创型(未融资)',
        )
        distribution = dict.fromkeys(ordered_stage_names, 0)

        for company in companies:
            stage_name = stage_name_by_id[company.finance_stage]
            if stage_name not in distribution:
                continue
            distribution[stage_name] += 1
        return distribution
コード例 #4
0
def resolve_all_companies(*_):
    """Return whatever ``CompanyController.get_all_companies()`` yields.

    Any positional arguments are accepted and ignored.
    """
    return CompanyController.get_all_companies()