Example #1
    def Spider(self):
        while not self.pagequeue.empty():
            url = self.pagequeue.get()
            print('正在爬取:{}'.format(url))  # "now crawling: <url>"
            req = requests.get(url, headers=get_header())
            req.encoding = 'gbk'
            html = etree.HTML(req.text)
            for i in range(4, 54):
                try:
                    title = html.xpath(
                        '//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(
                            i))
                    if not title:  # no more job rows on this page
                        break
                    name = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[1]/a/text()'.
                        format(i))
                    url = html.xpath(
                        '//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(
                            i))
                    print(url[0])
                    area = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[2]/text()'.format(
                            i))
                    salery = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[3]/text()'.format(
                            i))
                    time = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[4]/text()'.format(
                            i))
                    req1 = requests.get(url[0], headers=get_header())
                    req1.encoding = 'gb2312'
                    html1 = etree.HTML(req1.text)
                    detail = ''.join(
                        html1.xpath(
                            '//*[@class="bmsg job_msg inbox"]//*/text()'))
                    if not detail.strip():  # retry when only whitespace came back
                        detail = ''.join(
                            html1.xpath(
                                '//*[@class="bmsg job_msg inbox"]/text()'))
                    print(detail)
                    gongsi = ''.join(
                        html1.xpath('//*[@class="tmsg inbox"]/text()'))
                    if not gongsi.strip():
                        gongsi = ''.join(
                            html1.xpath('//*[@class="tmsg inbox"]//*/text()'))
                    data = {
                        "职位名称": title[0],
                        "详细链接": url[0],
                        "公司名称": name[0],
                        "工作地点": area[0],
                        "薪资": salery[0] if len(salery) != 0 else None,
                        "发布时间": time[0],
                        "职位信息": detail,
                        "公司信息": gongsi
                    }
                    self.jobqueue.put(data)
                except Exception:
                    # a row with missing fields raises IndexError; skip it
                    continue
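All of the examples on this page assume `import requests`, `from lxml import etree`, and a `get_header()` helper that is never shown. A minimal sketch of that helper, assuming its only job is to supply a plausible rotating User-Agent:

import random

# Hypothetical reconstruction of the helper the examples call; any dict of
# request headers with a believable User-Agent satisfies the call sites.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]

def get_header():
    return {'User-Agent': random.choice(USER_AGENTS)}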
Example #2
 def Spider(self):
     for i in range(1, 31):
         s = requests.Session()
         s.get(
             url=
             'https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
             headers=get_header(),
             timeout=3)
         cookie = s.cookies
         req = requests.post(self.baseurl,
                             headers=self.header,
                             data={
                                 'first': True,
                                 'pn': i,
                                 'kd': self.keyword
                             },
                             params={
                                 'px': 'default',
                                 'city': self.city,
                                 'needAddtionalResult': 'false'
                             },
                             cookies=cookie,
                             timeout=3)
         text = req.json()
         datas = text['content']['positionResult']['result']
         for data in datas:
             s = requests.Session()
             s.get(
                 url=
                 'https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
                 headers=get_header(),
                 timeout=3)
             cookie1 = s.cookies
             url = 'https://www.lagou.com/jobs/' + str(
                 data.get('positionId')) + '.html'
             req1 = requests.get(url, headers=self.header, cookies=cookie1)
             req1.encoding = 'utf-8'
             html = etree.HTML(req1.text)
             detail = ''.join(
                 html.xpath('//*[@class="job-detail"]//*/text()')).strip()
             if detail.isspace():
                 detail = ''.join(
                     html.xpath('//*[@class="job-detail"]/text()')).strip()
             print(detail)
             # use a new name so the loop variable `data` is not shadowed
             item = {
                 "职位名称": data.get('positionName'),
                 "工作地点": data.get('district'),
                 "薪资": data.get('salary'),
                 "公司名称": data.get('companyFullName'),
                 "经验要求": data.get('workYear'),
                 "学历": data.get('education'),
                 "福利": data.get('positionAdvantage'),
                 "详细链接": url,
                 "职位信息": detail
             }
             self.data.put(item)
 def Spider(self):
     for i in range(1, 31):
         s = requests.Session()
         s.get(
             url='https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
             headers=get_header(), timeout=3)
         cookie = s.cookies
         req = requests.post(self.baseurl, headers=self.header, data={'first': True, 'pn': i, 'kd': self.keyword},
                             params={'px': 'default', 'city': self.city, 'needAddtionalResult': 'false'},
                             cookies=cookie, timeout=3)
         text = req.json()
         datas = text['content']['positionResult']['result']
         for data in datas:
             s = requests.Session()
             s.get(
                 url='https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
                 headers=get_header(), timeout=3)
             cookie1 = s.cookies
             url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
             req1 = requests.get(url, headers=self.header, cookies=cookie1)
             req1.encoding = 'utf-8'
             html = etree.HTML(req1.text)
             detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
             if detail.isspace():
                 detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
             print(detail)
             datax = {
                 "positionName": data.get('positionName'),
                 "district": data.get('district'),
                 "salary": data.get('salary'),
                 "companyFullName": data.get('companyFullName'),
                 "workYear": data.get('workYear'),
                 "education": data.get('education'),
                 "positionAdvantage": data.get('positionAdvantage'),
                 "url": url,
                 "detail": detail
             }
             self.data.put(datax)
             insert_sql = """
             insert into lagou_job(keyword,city,positionName,district,salary,companyFullName,workYear,education,positionAdvantage,url,detail) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
             """
             cursor.execute(insert_sql, (self.keyword,self.city,
             datax['positionName'], datax['district'], datax['salary'], datax['companyFullName'], datax['workYear'],
             datax['education'], datax['positionAdvantage'], datax['url'], datax['detail']))
             conn.commit()
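The `cursor` and `conn` globals used by the INSERT are not defined anywhere on this page; a plausible setup, assuming PyMySQL and a local MySQL instance (connection parameters are placeholders):

import pymysql

# Hypothetical module-level connection; swap in real credentials.
conn = pymysql.connect(host='localhost', user='root', password='secret',
                       db='jobs', charset='utf8mb4')
cursor = conn.cursor()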
Example #4
    def Spider(self):
        while not self.pagequeue.empty():
            url = self.pagequeue.get()
            print('正在爬取:{}'.format(url))  # "now crawling: <url>"
            req = requests.get(url, headers=get_header())
            req.encoding = 'gbk'
            html = etree.HTML(req.text)
            for i in range(4, 54):
                try:
                    title = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(i))
                    if not title:  # no more job rows on this page
                        break
                    name = html.xpath('//*[@id="resultList"]/div[{}]/span[1]/a/text()'.format(i))
                    url = html.xpath('//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(i))
                    area = html.xpath('//*[@id="resultList"]/div[{}]/span[2]/text()'.format(i))
                    salery = html.xpath('//*[@id="resultList"]/div[{}]/span[3]/text()'.format(i))
                    time = html.xpath('//*[@id="resultList"]/div[{}]/span[4]/text()'.format(i))
                    data = {
                        "职位名称": title[0],
                        "详细链接": url[0],
                        "公司名称": name[0],
                        "工作地点": area[0],
                        "薪资": salery[0],
                        "发布时间": time[0]
                    }
                    self.jobqueue.put(data)
                except Exception:
                    break
Example #5
    def Spider(self):
        page = 1
        city = self._get_city_code()
        data = []
        while True:
            params = {
                'query': self.keyword,
                'page': page,
                'city': city
            }
            req = requests.get(url=self.base_url, params=params, headers=get_header())
            print(req.url)
            req.encoding = req.apparent_encoding
            code = req.json().get('html')
            if code:
                html = etree.HTML(code)
                title = html.xpath('//*[@class="title"]/h4/text()')
                href = map(lambda x: 'https://www.zhipin.com'+x, html.xpath('//*[@class="item"]/a/@href'))
                salary = html.xpath('//*[@class="salary"]/text()')
                company = html.xpath('//*[@class="name"]/text()')
                area = html.xpath('//*[@class="msg"]/em[1]/text()')
                workingExp = html.xpath('//*[@class="msg"]/em[2]/text()')
                eduLevel = html.xpath('//*[@class="msg"]/em[3]/text()')
                for t,s,c,a,w,e,h in zip(title, salary, company, area, workingExp, eduLevel, href):
                    job = {}
                    job['职位名称'] = t
                    job['职位链接'] = h
                    job['公司名称'] = c
                    job['工作地点'] = a
                    job['薪资'] = s
                    job['工作经验'] = w
                    job['学历要求'] = e

                    jobDetail = BossModel()
                    jobDetail.name = t
                    jobDetail.url = h
                    jobDetail.company_name = c
                    jobDetail.area = a
                    jobDetail.salary = s
                    jobDetail.work_exp = w
                    jobDetail.edu_leve = e
                    jobDetail.city = self.city
                    jobDetail.key_word = self.keyword
                    saleryArray = s.split("-", 1)
                    if len(saleryArray) >= 2:
                        jobDetail.salary_min = saleryArray[0]
                        jobDetail.salary_max = saleryArray[1]
                    # jobDetail.save()


                    data.append(job)
                page += 1
            else:
                break
        return data
 def _get_city_code(self):
     url = 'https://www.zhipin.com/wapi/zpCommon/data/city.json'
     req = requests.get(url=url, headers=get_header()).json()
     if req['message'] == 'Success':
         city_code_dict = req.get('zpData').get('cityList')
         for i in city_code_dict:
             for c in i['subLevelModelList']:
                 if c['name'] == self.city:
                     return str(c['code'])
         return '100010000'  # nationwide (全国)
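For reference, `_get_city_code` above assumes the city.json payload is shaped roughly like this (keys inferred from the lookup code; the codes shown are illustrative):

# Assumed response structure, inferred from the traversal in _get_city_code.
city_json_shape = {
    'message': 'Success',
    'zpData': {
        'cityList': [
            {
                'name': '四川',
                'subLevelModelList': [
                    {'name': '成都', 'code': 101270100},
                ],
            },
        ],
    },
}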
Example #7
 def __init__(self, keyword, city='北京', thread=10, path=os.getcwd()):
     self.keyword = keyword
     self.city = city
     self.thread = thread
     self.csv_header = ['职位名称', '详细链接', '公司名称', '工作地点', '薪资', '发布时间']
     self.baseurl = 'https://search.51job.com/list/'
     self.header = get_header()
     self.path = path
     self.pagequeue = queue.Queue()
     self.jobqueue = queue.Queue()
 def searchRequests(self, url):
     """
         发起查询请求
         :param url: 猎聘网搜索职位的url
         :return: 返回搜索到的text
     """
     try:
         r = requests.get(url, headers=get_header())
         r.encoding = 'utf-8'
     except:
         r = requests.get("https://www.liepin.com" + url)
     finally:
         # with open("text.html", 'w') as f:
         #     f.write(r.text)
         return r.text
def get_one_page(query, n):
    data = {
        'query': query,
        'cityCode': 0,
        'pageNum': n,
    }
    url = 'https://www.kanzhun.com/search/interview.json?' + urlencode(data)
    try:
        response = requests.get(url, headers=get_header(), timeout=3)
        if response.status_code == 200:
            return response.text
        return None
    except requests.ConnectionError:
        print('请求网页出错')  # request failed
        return None
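A minimal way to exercise `get_one_page`, assuming the endpoint still answers with a JSON body (the keyword and the inspection step are illustrative):

import json

text = get_one_page('python', 1)
if text:
    payload = json.loads(text)
    # Inspect the top-level keys before committing to a parsing scheme.
    print(list(payload.keys()))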
Example #10
 def Spider(self):
     jobl = []
     for page in range(self.page):
         params = {
             "start": 90 * page,
             "pageSize": 90,
             "workExperience": -1,
             "education": -1,
             "companyType": -1,
             "employmentType": -1,
             "jobWelfareTag": -1,
             "kw": self.keyword,
             "kt": 3,
             "cityId": self.city,
             "salary": '0, 0'
         }
         req = requests.get(url=self.base_url, params=params, headers=get_header())
         data = req.json()['data']['results']
         if len(data) != 0:
             for job in data:
                 print(job)
                 jobd = {}
                 jobd['ID'] = job.get('number')
                 jobd['工作名称'] = job.get('jobName')
                 jobd['招聘详细链接'] = job.get('positionURL')
                 company = job.get('company')
                 jobd['公司名称'] = company.get('name')
                 jobd['公司ID'] = company.get('number')
                 jobd['公司性质'] = company.get('type').get('name')
                 jobd['公司规模'] = company.get('size').get('name')
                 jobd['公司招聘主页'] = company.get('url')
                 jobd['公司地点'] = job.get('city').get('display')
                 jobd['薪资'] = job.get('salary')
                 jobd['学历要求'] = job.get('eduLevel').get('name')
                 try:
                     jobd['工作经历'] = job.get('workingExp').get('name')
                 except AttributeError:
                     # workingExp can be null in the API payload
                     jobd['工作经历'] = '经验不限'
                 jobd['职位类型'] = job.get('emplType')
                 jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
                 jobd['工作发布标签'] = job.get('timeState')
                 jobd['更新时间'] = job.get('updateDate')
                 jobl.append(jobd)
         else:
             break
     return jobl
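Because `Spider` returns a list of dicts whose keys double as column headers, persisting the result is a single `csv.DictWriter` call; a sketch (the filename is illustrative):

import csv

def save_jobs(jobl, path='zhilian.csv'):
    if not jobl:
        return
    # utf-8-sig so Excel renders the Chinese headers correctly.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=list(jobl[0].keys()))
        writer.writeheader()
        writer.writerows(jobl)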
Example #11
 def Spider(self):
     page = 1
     city = self._get_city_code()
     data = []
     while True:
         params = {'query': self.keyword, 'page': page, 'city': city}
         req = requests.get(url=self.base_url,
                            params=params,
                            headers=get_header())
         print(req.url)
         req.encoding = req.apparent_encoding
         code = req.json().get('html')
         if code:
             html = etree.HTML(code)
             title = html.xpath('//*[@class="title"]/h4/text()')
             href = map(lambda x: 'https://www.zhipin.com' + x,
                        html.xpath('//*[@class="item"]/a/@href'))
             salary = html.xpath('//*[@class="salary"]/text()')
             company = html.xpath('//*[@class="name"]/text()')
             area = html.xpath('//*[@class="msg"]/em[1]/text()')
             workingExp = html.xpath('//*[@class="msg"]/em[2]/text()')
             eduLevel = html.xpath('//*[@class="msg"]/em[3]/text()')
             for t, s, c, a, w, e, h in zip(title, salary, company, area,
                                            workingExp, eduLevel, href):
                 job = {}
                 job['职位名称'] = t
                 job['职位链接'] = h
                 job['公司名称'] = c
                 job['工作地点'] = a
                 job['薪资'] = s
                 job['工作经验'] = w
                 job['学历要求'] = e
                 data.append(job)
             page += 1
         else:
             break
     return data
Example #12
    def Spider(self):
        while not self.pagequeue.empty():
            url = self.pagequeue.get()
            print('正在爬取:{}'.format(url))  # "now crawling: <url>"
            req = requests.get(url, headers=get_header())
            req.encoding = 'gbk'
            html = etree.HTML(req.text)
            for i in range(4, 54):
                try:
                    title = html.xpath(
                        '//*[@id="resultList"]/div[{}]/p/span/a/@title'.format(
                            i))
                    if not title:  # no more job rows on this page
                        break
                    name = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[1]/a/text()'.
                        format(i))
                    url = html.xpath(
                        '//*[@id="resultList"]/div[{}]/p/span/a/@href'.format(
                            i))
                    print(url[0])
                    area = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[2]/text()'.format(
                            i))
                    salery = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[3]/text()'.format(
                            i))
                    time = html.xpath(
                        '//*[@id="resultList"]/div[{}]/span[4]/text()'.format(
                            i))
                    req1 = requests.get(url[0], headers=get_header())
                    req1.encoding = 'gb2312'
                    html1 = etree.HTML(req1.text)
                    detail = ''.join(
                        html1.xpath(
                            '//*[@class="bmsg job_msg inbox"]//*/text()'))
                    if not detail.strip():
                        detail = ''.join(
                            html1.xpath(
                                '//*[@class="bmsg job_msg inbox"]/text()'))
                    print(detail)
                    gongsi = ''.join(
                        html1.xpath('//*[@class="tmsg inbox"]/text()'))
                    if not gongsi.strip():
                        gongsi = ''.join(
                            html1.xpath('//*[@class="tmsg inbox"]//*/text()'))

                    jobDetail = Qcwy()
                    jobDetail.title = title[0]
                    jobDetail.url = url[0]
                    jobDetail.company_name = name[0]
                    jobDetail.area = area[0]
                    jobDetail.salery = salery[0] if len(salery) != 0 else None
                    jobDetail.time = time[0]
                    jobDetail.detail = detail
                    jobDetail.company_info = gongsi

                    jobDetail.city = self.city
                    jobDetail.key_word = self.keyword

                    if len(salery) > 0:
                        saleryArray = salery[0].split("-", 1)
                        if len(saleryArray) >= 2:
                            jobDetail.salery_min = saleryArray[0]
                            jobDetail.salery_max = saleryArray[1]

                    # jobDetail.save()

                    data = {
                        "职位名称": title[0],
                        "详细链接": url[0],
                        "公司名称": name[0],
                        "工作地点": area[0],
                        "薪资": salery[0] if len(salery) != 0 else None,
                        "发布时间": time[0],
                        "职位信息": detail,
                        "公司信息": gongsi
                    }
                    self.jobqueue.put(data)
                except Exception:
                    # a row with missing fields raises IndexError; skip it
                    continue
    def Spider(self):
        for page in range(1, 11):
            params = {'query': self.keyword, 'page': page}
            # req = requests.get(url=self.base_url + 'c' +self._get_city_code(), params=params, headers=get_header())
            # print(req.url)
            # html = etree.HTML(req.text)

            browser.get("https://www.zhipin.com/c101280600?query=java&page=" +
                        str(page))
            htmlStr = browser.page_source
            # the else-branch below queries html with XPath, so parse with
            # lxml; pyquery objects have no .xpath method
            html = etree.HTML(htmlStr)

            if page == 1:
                for i in range(1, 30):
                    title = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]'
                        .format(i)).text
                    link = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a'.
                        format(i)).get_attribute('href')
                    name = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a'
                        .format(i)).text
                    data1 = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p'.
                        format(i)).text.split(' ')
                    area = data1[0]
                    salery = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span'
                        .format(i)).text
                    exp = data1[1]
                    study = data1[2]
                    belong = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'
                        .format(i)).text
                    status = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'
                        .format(i)).text
                    try:
                        size = browser.find_element_by_xpath(
                            '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p'
                            .format(i)).text
                    except:
                        size = '无'
                    Hr = browser.find_element_by_xpath(
                        '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3'.
                        format(i)).text
                    req1 = requests.get(url=link, headers=get_header())
                    req1.encoding = 'utf-8'
                    html1 = etree.HTML(req1.text)
                    detail = ''.join(
                        html1.xpath(
                            '//*[@class="job-sec"][1]//*/text()')).strip()
                    if detail.isspace():
                        detail = ''.join(
                            html1.xpath(
                                '//*[@class="job-sec"][1]/text()')).strip()
                    print(detail)
                    gongsi = ''.join(
                        html1.xpath(
                            '//*[@class="job-sec company-info"]//*/text()')
                    ).strip()
                    gongshang = ''.join(
                        html1.xpath(
                            '//*[@class="job-sec"][3]//*/text()')).strip()
                    if '点击查看地图' in gongshang:
                        gongshang = ''.join(
                            html1.xpath(
                                '//*[@class="job-sec"][2]//*/text()')).strip()
                    data = {}
                    data.update(职位名称=title,
                                公司名称=name,
                                职位链接=link,
                                工作地点=area,
                                薪资=salery,
                                工作经验=exp,
                                学历要求=study,
                                所属领域=belong,
                                公司状态=status,
                                公司规模=size,
                                发布人=Hr,
                                职位信息=detail,
                                公司介绍=gongsi,
                                工商信息=gongshang)
                    self.downloadqueue.put(data)
            else:
                try:
                    for i in range(1, 31):
                        title = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'
                            .format(i))[0]
                        name = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'
                            .format(i))[0]
                        link = self.base_url.rstrip('/') + html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/@href'
                            .format(i))[0]
                        data1 = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'
                            .format(i))
                        area = data1[0]
                        salery = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'
                            .format(i))[0]
                        exp = data1[1]
                        study = data1[2]
                        belong = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'
                            .format(i))[0]
                        status = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'
                            .format(i))[0]
                        try:
                            size = html.xpath(
                                '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'
                                .format(i))[0]
                        except IndexError:
                            size = '无'
                        who = html.xpath(
                            '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'
                            .format(i))[0]
                        try:
                            job = html.xpath(
                                '//*[@id="main"]/div/div[2]/ul/li[1]/div/div[3]/h3/text()[2]'
                                .format(i))[0]
                        except:
                            job = '无'
                        Hr = '{}/{}'.format(who, job)
                        req1 = requests.get(url=link, headers=get_header())
                        req1.encoding = 'utf-8'
                        html1 = etree.HTML(req1.text)
                        detail = ''.join(
                            html1.xpath(
                                '//*[@class="job-sec"][1]//*/text()')).strip()
                        gongsi = ''.join(
                            html1.xpath(
                                '//*[@class="job-sec company-info"]//*/text()')
                        ).strip()
                        gongshang = ''.join(
                            html1.xpath(
                                '//*[@class="job-sec"][3]//*/text()')).strip()
                        if '点击查看地图' in gongshang:
                            gongshang = ''.join(
                                html1.xpath(
                                    '//*[@class="job-sec"][2]//*/text()')
                            ).strip()
                        print(detail)
                        data = {}
                        data.update(职位名称=title,
                                    公司名称=name,
                                    职位链接=link,
                                    工作地点=area,
                                    薪资=salery,
                                    工作经验=exp,
                                    学历要求=study,
                                    所属领域=belong,
                                    公司状态=status,
                                    公司规模=size,
                                    发布人=Hr,
                                    职位信息=detail,
                                    公司介绍=gongsi,
                                    工商信息=gongshang)
                        self.downloadqueue.put(data)

                        insert_sql = """
                        insert into boss(title,name,link,area,salery,exp,study,belong,status,size,Hr,detail,gongsi,gongshang) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        """
                        cursor.execute(insert_sql,
                                       (title, name, link, area, salery, exp,
                                        study, belong, status, size, Hr,
                                        detail, gongsi, gongshang))
                        conn.commit()
                except Exception:
                    continue
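The Selenium-driven variant above relies on a module-level `browser` object that the page never defines; a minimal sketch, assuming Chrome and selenium 3.x (to match the find_element_by_xpath calls):

from selenium import webdriver

# Hypothetical driver setup for the snippet above; selenium 3.x is assumed,
# since find_element_by_xpath was removed in selenium 4.3.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)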
    def Spider(self):
        jobl = []
        for page in range(self.page):
            params = {
                "start": 90 * page,
                "pageSize": 90,
                "workExperience": -1,
                "education": -1,
                "companyType": -1,
                "employmentType": -1,
                "jobWelfareTag": -1,
                "kw": self.keyword,
                "kt": 3,
                "cityId": self.city,
                "salary": '0, 0'
            }
            req = requests.get(url=self.base_url,
                               params=params,
                               headers=get_header())
            cookie = req.cookies
            print(cookie)
            data = req.json()['data']['results']
            if len(data) != 0:
                for job in data:
                    # print(job)
                    jobd = {}
                    jobd['ID'] = job.get('number')
                    jobd['工作名称'] = job.get('jobName')
                    jobd['招聘详细链接'] = job.get('positionURL')
                    company = job.get('company')
                    jobd['公司名称'] = company.get('name')
                    jobd['公司ID'] = company.get('number')
                    jobd['公司性质'] = company.get('type').get('name')
                    jobd['公司规模'] = company.get('size').get('name')
                    jobd['公司招聘主页'] = company.get('url')
                    jobd['公司地点'] = job.get('city').get('display')
                    jobd['薪资'] = job.get('salary')
                    jobd['学历要求'] = job.get('eduLevel').get('name')
                    try:
                        jobd['工作经历'] = job.get('workingExp').get('name')
                    except AttributeError:
                        # workingExp can be null in the API payload
                        jobd['工作经历'] = '经验不限'
                    jobd['职位类型'] = job.get('emplType')
                    jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
                    jobd['工作发布标签'] = job.get('timeState')
                    jobd['更新时间'] = job.get('updateDate')
                    header = get_header()
                    header['referer'] = job.get('positionURL')
                    header['upgrade-insecure-requests'] = '1'
                    ZHILIAN_COOKIE = 'adfbid2=0; x-zp-client-id=912a3ef6-bb0d-46c9-a396-ea9566250f8a; sts_deviceid=16bd0b9394299-0b310c2e8d3a19-e343166-2073600-16bd0b93943668; acw_tc=2760824a15648151356115930e8773f8ab7c33fc21043ce1c44b11cd7f5da1; _uab_collina=156481966519005574477937; urlfrom2=121114583; adfcid2=www.baidu.com; sou_experiment=capi; ZP_OLD_FLAG=false; CANCELALL=1; LastCity=%E5%8C%97%E4%BA%AC; LastCity%5Fid=530; dywea=95841923.417992036732763400.1562574932.1564819776.1564824996.3; dywez=95841923.1564824996.3.3.dywecsr=baidu|dyweccn=(organic)|dywecmd=organic; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1562574933,1564819776,1564824996; __utma=269921210.1630440294.1562574933.1564819776.1564824996.3; __utmz=269921210.1564824996.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216bd0b9395e473-06c4ddd9790ee2-e343166-2073600-16bd0b9395f866%22%2C%22%24device_id%22%3A%2216bd0b9395e473-06c4ddd9790ee2-e343166-2073600-16bd0b9395f866%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baiduPC%22%2C%22%24latest_utm_medium%22%3A%22CPC%22%2C%22%24latest_utm_campaign%22%3A%22pp%22%2C%22%24latest_utm_content%22%3A%22pp%22%2C%22%24latest_utm_term%22%3A%228804373%22%7D%7D; sts_sg=1; sts_chnlsid=Unknown; jobRiskWarning=true; sts_sid=16c5fc1dd9645-03591d4b89f0ab-e343166-2073600-16c5fc1dd97a91; acw_sc__=5d479f892414863f1ea95e2ce7352ba574fd9d2f; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCC120116927J00122526309.htm; ZL_REPORT_GLOBAL={%22jobs%22:{%22recommandActionidShare%22:%22de9c9056-061b-4d15-9b59-d930c831f19e-job%22%2C%22funczoneShare%22:%22dtl_best_for_you%22}%2C%22company%22:{%22actionid%22:%22fe6b5ca0-7e6b-4587-874f-4f39773534a2-company%22%2C%22funczone%22:%22hiring_jd%22}}; sts_evtseq=14'
                    # header['cookie'] = config.ZHILIAN_COOKIE
                    header['cookie'] = ZHILIAN_COOKIE
                    req1 = requests.get(
                        job.get('positionURL'),
                        headers=header,
                    )
                    req1.encoding = 'utf-8'
                    html = etree.HTML(req1.text)
                    detail = ''.join(
                        html.xpath(
                            '//*[@class="describtion__detail-content"]//*/text()'
                        ))
                    if not detail:
                        detail = ''.join(
                            html.xpath(
                                '//*[@class="describtion__detail-content"]/text()'
                            ))
                    print(detail)
                    jobd['职位描述'] = detail.strip()

                    insert_sql = """
                    insert into zhilian(number ,jobName ,positionURL ,comName ,comNumber ,comType ,comSize ,comUrl ,comCity ,salary ,eduLevel ,workingExp ,emplType ,walfare ,timeState ,updateDate ,detail) 
                    values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    """
                    cursor.execute(
                        insert_sql,
                        (job.get('number'), job.get('jobName'),
                         job.get('positionURL'), company.get('name'),
                         company.get('number'),
                         company.get('type').get('name'),
                         company.get('size').get('name'), company.get('url'),
                         job.get('city').get('display'), job.get('salary'),
                         job.get('eduLevel').get('name'),
                         job.get('workingExp').get('name')
                         if job.get('workingExp')
                         and job.get('workingExp').get('name') else '经验不限',
                         job.get('emplType'), '、'.join(job.get('welfare'))
                         or '无', job.get('timeState'), job.get('updateDate'),
                         detail.strip()))
                    conn.commit()
                    jobl.append(jobd)
            else:
                break
        return jobl
Example #15
 def Spider(self):
     for page in range(1, 11):
         params = {'query': self.keyword, 'page': page}
         req = requests.get(url=self.base_url + 'c' + self._get_city_code(),
                            params=params,
                            headers=get_header())
         print(req.url)
         html = etree.HTML(req.text)
         if page == 1:
             for i in range(1, 30):
                 title = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'
                     .format(i))[0]
                 name = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a/text()'
                     .format(i))[0]
                 data1 = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p/text()'
                     .format(i))
                 area = data1[0]
                 salery = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span/text()'
                     .format(i))[0]
                 exp = data1[1]
                 study = data1[2]
                 belong = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[1]'
                     .format(i))[0]
                 status = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[2]'
                     .format(i))[0]
                 try:
                     size = html.xpath(
                         '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[3]'
                         .format(i))[0]
                 except:
                     size = '无'
                 who = html.xpath(
                     '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[1]'
                     .format(i))[0]
                 try:
                     job = html.xpath(
                         '//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[2]'
                         .format(i))[0]
                 except:
                     job = '无'
                 Hr = '{}/{}'.format(who, job)
                 data = {}
                 data.update(职位名称=title,
                             公司名称=name,
                             工作地点=area,
                             薪资=salery,
                             工作经验=exp,
                             学历要求=study,
                             所属领域=belong,
                             公司状态=status,
                             公司规模=size,
                             发布人=Hr)
                 self.downloadqueue.put(data)
         else:
             try:
                 for i in range(1, 31):
                     title = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'
                         .format(i))[0]
                     name = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'
                         .format(i))[0]
                     data1 = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'
                         .format(i))
                     area = data1[0]
                     salery = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'
                         .format(i))[0]
                     exp = data1[1]
                     study = data1[2]
                     belong = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'
                         .format(i))[0]
                     status = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'
                         .format(i))[0]
                     try:
                         size = html.xpath(
                             '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'
                             .format(i))[0]
                     except IndexError:
                         size = '无'
                     who = html.xpath(
                         '//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'
                         .format(i))[0]
                     try:
                         job = html.xpath(
                             '//*[@id="main"]/div/div[2]/ul/li[1]/div/div[3]/h3/text()[2]'
                             .format(i))[0]
                     except:
                         job = '无'
                     Hr = '{}/{}'.format(who, job)
                     data = {}
                     data.update(职位名称=title,
                                 公司名称=name,
                                 工作地点=area,
                                 薪资=salery,
                                 工作经验=exp,
                                 学历要求=study,
                                 所属领域=belong,
                                 公司状态=status,
                                 公司规模=size,
                                 发布人=Hr)
                     self.downloadqueue.put(data)
             except Exception:
                 continue
 def Spider(self):
     for page in range(1, 11):
         params = {
             'query': self.keyword,
             'page': page
         }
         req = requests.get(url=self.base_url + 'c' + self._get_city_code(), params=params, headers=get_header())
         print(req.url)
         html = etree.HTML(req.text)
         if page == 1:
             for i in range(1, 30):
                 title = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                 link = self.base_url.rstrip('/') + html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/@href'.format(i))[0]
                 name = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                 data1 = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                 area = data1[0]
                 salery = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                 exp = data1[1]
                 study = data1[2]
                 belong = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                 status = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                 try:
                     size = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                 except IndexError:
                     size = '无'
                 who = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                 try:
                     job = html.xpath('//*[@id="main"]/div/div[3]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                 except:
                     job = '无'
                 Hr = '{}/{}'.format(who, job)
                 req1 = requests.get(url=link, headers=get_header())
                 req1.encoding = 'utf-8'
                 html1 = etree.HTML(req1.text)
                 detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                 if not detail.strip():
                     detail = ''.join(html1.xpath('//*[@class="job-sec"][1]/text()')).strip()
                 print(detail)
                 gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                 gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                 if '点击查看地图' in gongshang:
                     gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                 data = {}
                 data.update(职位名称=title, 公司名称=name, 职位链接=link, 工作地点=area, 薪资=salery, 工作经验=exp, 学历要求=study,
                             所属领域=belong, 公司状态=status, 公司规模=size, 发布人=Hr, 职位信息=detail, 公司介绍=gongsi, 工商信息=gongshang)
                 self.downloadqueue.put(data)
         else:
             try:
                 for i in range(1, 31):
                     title = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/div[1]/text()'.format(i))[0]
                     name = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/h3/a/text()'.format(i))[0]
                     link = self.base_url.rstrip('/') + html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/@href'.format(i))[0]
                     data1 = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/p/text()'.format(i))
                     area = data1[0]
                     salery = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[1]/h3/a/span/text()'.format(i))[0]
                     exp = data1[1]
                     study = data1[2]
                     belong = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[1]'.format(i))[0]
                     status = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[2]'.format(i))[0]
                     try:
                         size = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[2]/div/p/text()[3]'.format(i))[0]
                     except IndexError:
                         size = '无'
                     who = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[1]'.format(i))[0]
                     try:
                         job = html.xpath('//*[@id="main"]/div/div[2]/ul/li[{}]/div/div[3]/h3/text()[2]'.format(i))[0]
                     except:
                         job = '无'
                     Hr = '{}/{}'.format(who, job)
                     req1 = requests.get(url=link, headers=get_header())
                     req1.encoding = 'utf-8'
                     html1 = etree.HTML(req1.text)
                     detail = ''.join(html1.xpath('//*[@class="job-sec"][1]//*/text()')).strip()
                     gongsi = ''.join(html1.xpath('//*[@class="job-sec company-info"]//*/text()')).strip()
                     gongshang = ''.join(html1.xpath('//*[@class="job-sec"][3]//*/text()')).strip()
                     if '点击查看地图' in gongshang:
                         gongshang = ''.join(html1.xpath('//*[@class="job-sec"][2]//*/text()')).strip()
                     print(detail)
                     data = {}
                     data.update(职位名称=title, 公司名称=name, 职位链接=link, 工作地点=area, 薪资=salery, 工作经验=exp, 学历要求=study,
                                 所属领域=belong, 公司状态=status, 公司规模=size, 发布人=Hr, 职位信息=detail, 公司介绍=gongsi,
                                 工商信息=gongshang)
                     self.downloadqueue.put(data)
             except Exception:
                 continue
Example #17
 def Spider(self):
     jobl = []
     for page in range(self.page):
         params = {
             "start": 90 * page,
             "pageSize": 90,
             "workExperience": -1,
             "education": -1,
             "companyType": -1,
             "employmentType": -1,
             "jobWelfareTag": -1,
             "kw": self.keyword,
             "kt": 3,
             "cityId": self.city,
             "salary": '0, 0'
         }
         req = requests.get(url=self.base_url,
                            params=params,
                            headers=get_header())
         cookie = req.cookies
         print(cookie)
         data = req.json()['data']['results']
         if len(data) != 0:
             for job in data:
                 # print(job)
                 jobd = {}
                 jobd['ID'] = job.get('number')
                 jobd['工作名称'] = job.get('jobName')
                 jobd['招聘详细链接'] = job.get('positionURL')
                 company = job.get('company')
                 jobd['公司名称'] = company.get('name')
                 jobd['公司ID'] = company.get('number')
                 jobd['公司性质'] = company.get('type').get('name')
                 jobd['公司规模'] = company.get('size').get('name')
                 jobd['公司招聘主页'] = company.get('url')
                 jobd['公司地点'] = job.get('city').get('display')
                 jobd['薪资'] = job.get('salary')
                 jobd['学历要求'] = job.get('eduLevel').get('name')
                 try:
                     jobd['工作经历'] = job.get('workingExp').get('name')
                 except AttributeError:
                     # workingExp can be null in the API payload
                     jobd['工作经历'] = '经验不限'
                 jobd['职位类型'] = job.get('emplType')
                 jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
                 jobd['工作发布标签'] = job.get('timeState')
                 jobd['更新时间'] = job.get('updateDate')
                 header = get_header()
                 header['referer'] = job.get('positionURL')
                 header['upgrade-insecure-requests'] = '1'
                 header['cookie'] = config.ZHILIAN_COOKIE
                 req1 = requests.get(
                     job.get('positionURL'),
                     headers=header,
                 )
                 req1.encoding = 'utf-8'
                 html = etree.HTML(req1.text)
                 detail = ''.join(
                     html.xpath(
                         '//*[@class="describtion__detail-content"]//*/text()'
                     ))
                 if not detail:
                     detail = ''.join(
                         html.xpath(
                             '//*[@class="describtion__detail-content"]/text()'
                         ))
                 print(job.get('positionURL'))
                 print(detail)
                 jobd['职位描述'] = detail.strip()
                 jobl.append(jobd)
         else:
             break
     return jobl
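Example #17 also reads `config.ZHILIAN_COOKIE`, which Example #12 inlines as a literal; a minimal `config.py` sketch (the value is a placeholder for a cookie string copied from a logged-in browser session):

# config.py -- hypothetical companion module assumed by Example #17.
# Paste the raw Cookie header from a logged-in zhaopin.com session here.
ZHILIAN_COOKIE = 'acw_tc=...; sts_sid=...; sensorsdata2015jssdkcross=...'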