Ejemplo n.º 1
0
def crawl_company_stage(company_id):
    """Fetch a company's mobile page and extract its industry, stage and size.

    The page text of the first element with class ``desc`` is split on '/'
    and the first three parts are taken, in order, as industry field,
    finance stage and staff number.

    :param company_id: company identifier, interpolated into the page URL
    :return: ``[company_id, industryField, financeStage, staffNum]``; any
        field that could not be determined (non-200 response, missing
        ``desc`` element, too few '/'-separated parts) is ``None``
    """
    req_url = 'https://m.lagou.com/gongsi/%s.html' % str(company_id)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'm.lagou.com',
        'Referer': 'https://m.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    }

    # Defaults so the return below is well-defined on every code path;
    # previously a non-200 response raised UnboundLocalError.
    industry_field = finance_stage = staff_num = None

    response = requests.get(req_url, headers=headers, cookies=m_lagou_spider.get_cookies(), timeout=20)
    print(response.url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html5lib')
        desc_nodes = soup.find_all(class_="desc")
        if desc_nodes:
            # Split once instead of re-splitting the same text per field.
            parts = [part.strip() for part in desc_nodes[0].get_text().strip().split('/')]
            if len(parts) < 3:
                log.error('unexpected company desc for %s: %r' % (company_id, parts))
                # Pad so the unpacking below never fails; missing fields stay None.
                parts += [None] * (3 - len(parts))
            industry_field, finance_stage, staff_num = parts[:3]
        else:
            log.error('no desc element found for company %s' % company_id)
    elif response.status_code == 403:
        log.error('403 forbidden...')
    else:
        log.error(response.status_code)
    # Throttle between requests regardless of the outcome.
    time.sleep(config.TIME_SLEEP)

    return [company_id, industry_field, finance_stage, staff_num]
Ejemplo n.º 2
0
def crawl_company(havemark=0):
    """Crawl 20 pages of the lagou company listing.

    For each page a POST is sent to the listing JSON endpoint; on HTTP 200
    every entry of the reply's ``result`` list is flattened into a row.

    :param havemark: value formatted into the URL and sent as the
        ``havemark`` query parameter
    :return: list of rows, each
        ``[companyId, companyShortName, city, companyFeatures,
        companyFullName, financeStage, industryField, interviewRemarkNum,
        positionNum, processRate]``
    """
    # Fields copied from each company record, in output-column order.
    row_keys = (
        'companyId', 'companyShortName',
        'city', 'companyFeatures',
        'companyFullName', 'financeStage',
        'industryField', 'interviewRemarkNum',
        'positionNum', 'processRate',
    )

    # Request URL.
    listing_url = 'https://www.lagou.com/gongsi/0-0-0.json?havemark=%d' % havemark

    # Request headers.
    request_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/gongsi/0-0-0?havemark=0',
        'User-Agent': (
            'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 '
            '(KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
        ),
    }

    # Accumulates one row per company across all pages.
    companies = []

    # Loop over the page numbers.
    for page_index in range(20):
        query = dict(
            first='false',
            pn=str(page_index),
            sortField='0',
            havemark=str(havemark),
        )

        response = requests.post(
            listing_url,
            headers=request_headers,
            params=query,
            cookies=m_lagou_spider.get_cookies(),
            timeout=10,
        )
        print(response.url)

        status = response.status_code
        if status == 200:
            for company in response.json()['result']:
                companies.append([company[key] for key in row_keys])
            log.info('page %d has been crawled down~' % (page_index + 1))
        elif status == 403:
            log.error('403 forbidden...')
        else:
            log.error(status)

        # Sleep between requests.
        time.sleep(config.TIME_SLEEP)

    return companies
def get_max_page_no(company_id):
    """
    Return the number of pages of interviewees' comments for a company.

    Requests page 1 of the interview-experience listing and reads
    ``content.data.page.totalCount`` from the JSON reply.

    :param company_id: company identifier, sent as ``companyId`` and embedded
        in the Referer header
    :return: ``totalCount`` divided by the page size, rounded up; 0 when
        there are no comments or the response status is not 200
    """
    # Single source of truth for the page size: it is sent to the server and
    # used to turn the total item count into a page count.
    page_size = 10

    request_url = 'https://www.lagou.com/gongsi/searchInterviewExperiences.json'
    # The original literal listed 'Referer' twice; only the later value took
    # effect, so only that one is kept.
    headers = {
        'Accept':
        'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':
        'gzip, deflate, br',
        'Host':
        'www.lagou.com',
        'Referer':
        'https://www.lagou.com/gongsi/interviewExperiences.html?companyId=%s' %
        company_id,
        'User-Agent':
        'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0'
        ' Mobile/13B143 Safari/601.1'
    }

    params = {
        'companyId': company_id,
        'positionType': '',
        'pageSize': str(page_size),
        'pageNo': '1'
    }

    # timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.post(request_url,
                             headers=headers,
                             params=params,
                             cookies=get_cookies(),
                             timeout=10)
    if response.status_code != 200:
        log.error('Error code is ' + str(response.status_code))
        return 0

    total_count = int(response.json()['content']['data']['page']['totalCount'])
    # Ceiling division: 0 items -> 0 pages, 10 -> 1, 11 -> 2. The previous
    # int(n / 10) + 1 reported one page too many for exact multiples of 10
    # and reported 1 page for an empty or failed result.
    return (total_count + page_size - 1) // page_size
def crawl_interviewee_comments(company_id):
    """Crawl all pages of interviewees' comments for a company and store them.

    The page count comes from ``get_max_page_no``. Each page is requested in
    turn; on HTTP 200 every entry of ``content.data.page.result`` is passed
    to ``insert_item``. Non-200 pages are logged and skipped. The function
    sleeps ``config.TIME_SLEEP`` after every page request.

    :param company_id: company identifier, sent as ``companyId`` and embedded
        in the Referer header
    :return: None
    """
    request_url = 'https://www.lagou.com/gongsi/searchInterviewExperiences.json'
    # The original literal listed 'Referer' twice; only the later value took
    # effect, so only that one is kept.
    headers = {
        'Accept':
        'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':
        'gzip, deflate, br',
        'Host':
        'www.lagou.com',
        'Referer':
        'https://www.lagou.com/gongsi/interviewExperiences.html?companyId=%s' %
        company_id,
        'User-Agent':
        'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0'
        ' Mobile/13B143 Safari/601.1'
    }
    maxpage_no = get_max_page_no(company_id)

    # Page numbers sent to the server are 1-based; the range is empty when
    # maxpage_no is 0 or negative, so no separate guard is needed.
    for page_no in range(1, maxpage_no + 1):
        params = {
            'companyId': company_id,
            'positionType': '',
            'pageSize': '10',
            'pageNo': str(page_no)
        }

        # timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.post(request_url,
                                 headers=headers,
                                 params=params,
                                 cookies=get_cookies(),
                                 timeout=10)
        if response.status_code == 200:
            # Log success only once the status is known to be 200.
            log.info('Crawl page %s successfully~' % response.url)
            comment_list = response.json(
            )['content']['data']['page']['result']
            for comment in comment_list:
                insert_item(comment)
                log.info('insert one item successfully~')
        else:
            log.error('Error code is ' + str(response.status_code))

        # Throttle between page requests.
        time.sleep(config.TIME_SLEEP)