Beispiel #1
0
def total_pages(industry_url):
    """Return the total page count shown on a listing page.

    Fetches ``industry_url`` and reads the text of the first
    ``<i class="total_page">`` element found inside a
    ``<span class="num_operate">``.

    Args:
        industry_url: URL of the listing page to fetch.

    Returns:
        The page count as an ``int``.

    Raises:
        IndexError: if the page contains no matching element.
        ValueError: if the matched text is not an integer.
    """
    page = requests_get(industry_url)
    tree = etree.HTML(page.content)
    page_count_nodes = tree.xpath(
        "//span[@class='num_operate']/i[@class='total_page']/text()")
    # Only the first match is used.
    return int(page_count_nodes[0])
Beispiel #2
0
def industry_by_area(industry_url):
    """Store the links listed in a page's area filter.

    Fetches ``industry_url``, reads the text and ``href`` of every link in
    the ``filter_area`` filter list, skips the first entry, and inserts one
    record per remaining link into ``industry_db``.  Each record has the keys
    ``行政区`` (link text) and ``行业网址`` (link href).

    Nothing is inserted when no links remain after skipping the first entry.

    Args:
        industry_url: URL of the listing page to fetch.
    """
    response = requests_get(industry_url)
    html = etree.HTML(response.content)
    # Both queries share the same link selector; only the leaf differs.
    link_xpath = ('//div[@class="filter_item filter_area"]'
                  '/ul[@class="filter_items clearfix"]/li/a')
    area = html.xpath(link_xpath + '/text()')
    area_url = html.xpath(link_xpath + '/@href')
    # The first entry is skipped -- presumably the "all areas" option;
    # TODO confirm against the live page.
    area_df = pd.DataFrame(list(zip(area[1:], area_url[1:])),
                           columns=['行政区', '行业网址'])
    records = area_df.to_dict('records')
    if not records:
        # NOTE(review): industry_db looks like a PyMongo collection, whose
        # insert_many() raises TypeError on an empty list -- skip the call.
        return
    industry_db.insert_many(records)
Beispiel #3
0
def crawl_industry(city_url):
    """Store the industry categories listed on a city page.

    Fetches ``city_url``, reads the text and ``href`` of every link in the
    ``indcatelist`` list, drops the entry whose text is ``不限``, and inserts
    one record per remaining link into ``industry_db``.  Each record has the
    keys ``行业`` (link text), ``行业网址`` (absolute URL) and ``city_url``.

    Nothing is inserted when no categories remain after filtering.

    Args:
        city_url: URL of the city page to fetch.
    """
    response = requests_get(city_url)
    html = etree.HTML(response.content)
    industry_list = html.xpath("//ul[@class='indcatelist']/li/a/text()")
    # hrefs are joined onto city_url minus its last six characters --
    # presumably a trailing path segment; verify against the caller.
    url_prefix = city_url[:-6]
    industry_url = [
        url_prefix + item
        for item in html.xpath("//ul[@class='indcatelist']/li/a/@href")
    ]
    # Filter out the "不限" entry before building the frame.
    rows = [(name, url)
            for name, url in zip(industry_list, industry_url)
            if name != "不限"]
    if not rows:
        # NOTE(review): industry_db looks like a PyMongo collection, whose
        # insert_many() raises TypeError on an empty list -- skip the call.
        return
    industry_df = pd.DataFrame(rows, columns=['行业', '行业网址'])
    industry_df['city_url'] = city_url
    industry_db.insert_many(industry_df.to_dict('records'))
Beispiel #4
0
def crawl_jobs(page_url):
    """Scrape one listing page: store its job links and return the page count.

    Fetches ``page_url`` (e.g.
    ``https://sh.58.com/job/pn1/pve_5363_253_pve_5358_0/``), collects the
    ``href`` of every link inside a ``job_name clearfix`` div, discards links
    that start with ``https://legoclick``, strips the query string from the
    rest, and inserts one ``{"job_url", "page_url"}`` record per link into
    ``jobs_urls``.

    Args:
        page_url: URL of the listing page to fetch.

    Returns:
        The total page count read from the page's ``total_page`` span.

    Raises:
        IndexError: if the page contains no ``total_page`` span.
        ValueError: if the span text is not an integer.
    """
    response = requests_get(page_url)
    html = etree.HTML(response.content)
    # xpath() returns a list of text nodes; int() needs the first node, not
    # the list itself (passing the list raised TypeError on every call).
    total_page = int(html.xpath('//span[@class="total_page"]/text()')[0])
    job_links = [
        href.split('?')[0]
        for href in html.xpath("//div[@class='job_name clearfix']/a/@href")
        if not href.startswith('https://legoclick')
    ]
    if job_links:
        # NOTE(review): jobs_urls looks like a PyMongo collection, whose
        # insert_many() raises TypeError on an empty list -- only insert
        # when the page yielded at least one link.
        jobs_df = pd.DataFrame(job_links, columns=["job_url"])
        jobs_df['page_url'] = page_url
        jobs_urls.insert_many(jobs_df.to_dict('records'))
    print(page_url, '完成爬取...')
    return total_page
Beispiel #5
0
def parse_detail(job_url):
    """Fetch one job posting and collect its fields into a flat dict.

    The returned dict always carries ``job_url``, ``status_code`` and
    ``scrape_date`` (today's date formatted as ``YYYY/MM/DD``).  The posting
    and company fields are added only when the response status is 200; for
    any other status just those three keys are returned.

    Field extraction is delegated to the ``findall`` and ``xpath`` helpers,
    which are defined outside this block.

    Args:
        job_url: URL of the job detail page to fetch.

    Returns:
        A dict describing the posting.
    """
    response = requests_get(job_url, headers=HEADERS)
    status = response.status_code
    record = {
        "job_url": job_url,
        "status_code": status,
        "scrape_date": datetime.today().strftime('%Y/%m/%d')
    }
    if status != 200:
        return record

    raw = response.text
    tree = etree.HTML(raw)

    # Values matched by pattern against the raw page text.
    published = findall('"pubDate":(.*?)"upDate"', raw)
    lon = findall('"lon":"(.*?)"', raw)
    lat = findall('"lat":"(.*?)"}', raw)
    updated = findall('"upDate":(.*?)}', raw)

    record["pub_date"] = published
    record["update_date"] = updated
    # Key spelling ("lontitude") is part of the returned record's schema.
    record["lontitude"] = lon
    record["latitude"] = lat

    # Position details.
    record["pos_title"] = xpath('//span[@class="pos_title"]/text()', tree)
    record["pos_name"] = xpath('//span[@class="pos_name"]/text()', tree)
    record["pos_salary"] = xpath('//span[@class="pos_salary"]/text()', tree)
    welfare_items = xpath('//span[@class="pos_welfare_item"]/text()',
                          tree,
                          first=False)
    record["pos_welfare"] = '、'.join(welfare_items)
    condition_items = xpath('//div[@class="pos_base_condition"]/span/text()',
                            tree,
                            first=False)
    record["pos_condition"] = '、'.join(condition_items)
    area_items = xpath('//span[@class="pos_area_item"]/text()',
                       tree,
                       first=False)
    record["pos_area"] = '-'.join(area_items)
    record["pos_address"] = xpath('//div[@class="pos-area"]/span[2]/text()',
                                  tree)
    record["pos_description"] = xpath('//div[@class="des"]',
                                      tree,
                                      first=False,
                                      child=True)

    # Company details.
    record["company_name"] = xpath('//div[@class="baseInfo_link"]/a/text()',
                                   tree)
    record["company_url"] = xpath('//div[@class="baseInfo_link"]/a/@href',
                                  tree)
    record["title_sign"] = xpath('//span[@class="baseInfo_sign"]/i/@title',
                                 tree)
    record["company_industry"] = xpath(
        '//p[@class="comp_baseInfo_belong"]/a/text()', tree)
    record["company_scale"] = xpath('//p[@class="comp_baseInfo_scale"]/text()',
                                    tree)
    record["job_offers"] = xpath('//a[@class="look_pos"]/@href', tree)
    return record