Example 1
0
 def parse_position_page(self, response):
     """Parse a single job-posting page and yield a populated LiepinItem.

     Expects ``response.meta`` to carry ``company``, ``city``, ``salary``,
     ``pid``, ``key`` and ``rank`` set by the listing-page callback.
     """
     position = response.xpath(
         "//div[contains(@class,'title-info')]/h1/text()").extract()[0]
     requirement = '\n'.join(
         response.xpath(
             "//div[contains(@class,'main-message')][1]/div/text()").
         extract())
     benefit = ','.join(
         response.xpath(
             "//div[contains(@class,'tag-list')]/span/text()").extract())
     # Some postings have no company link; fall back to '' explicitly
     # instead of the original bare ``except:``, which silently swallowed
     # every exception (including typos and keyboard interrupts).
     companylink_nodes = response.xpath(
         "//div[@class='company-infor']/h4/a[1]/@href").extract()
     companylink = companylink_nodes[0] if companylink_nodes else ''
     item = LiepinItem()
     item['requirement'] = requirement
     item['companylink'] = companylink
     item['company'] = response.meta['company']
     item['position'] = position
     item['city'] = self.city_dict[response.meta['city']]
     item['salary'] = response.meta['salary']
     item['benefit'] = benefit
     item['pid'] = response.meta['pid']
     item['catagory'] = response.meta['key']
     item['rank'] = response.meta['rank']
     yield item
Example 2
0
    def parse_item(self, response):
        """Build a LiepinItem by delegating each field to its matching
        ``get_<field>`` accessor on the spider, then yield it."""
        field_names = ('position', 'welfare', 'annual', 'worklocation',
                       'education', 'condition', 'company',
                       'companylocation', 'type', 'scale')
        item = LiepinItem()
        # Same getters, same order as the original hand-written list.
        for name in field_names:
            item[name] = getattr(self, 'get_' + name)(response)

        yield item
Example 3
0
 def parse(self, response):
     """Yield one LiepinItem per job name found in the search-result list.

     BUG FIXES vs the original:
     - XPaths applied to a sub-selector were absolute (``//li``,
       ``//div[...]``), which searches the whole document instead of the
       current node; they are now relative (``.//``).
     - A single item object was mutated and re-yielded for every job;
       Scrapy would then hold many references to the same dict. A fresh
       item is now created per yield.
     """
     print(response)
     title = response.css('title::text').extract_first()
     print(title)
     positions = response.xpath(
         '//div[@class="job-content"]/div[@class="sojob-result "]/ul[@class="sojob-list"]'
     )
     for p in positions.xpath('.//li'):
         print(p)
         job_name = p.xpath(
             './/div[@class="job-info"]/h3/a/text()').extract()
         for j in job_name:
             item = LiepinItem()
             item['job_name'] = j
             yield item
Example 4
0
 def parse(self, response):
     """Scrape job listings with a single regex over the flattened page
     body and follow each detail URL, passing the partial item in meta."""
     # Strip every line break and tab so the one-line regex can match.
     body = response.body
     for ch in ('\n', '\t', '\r'):
         body = body.replace(ch, '')
     pattern = '<div class="job-info"><h3 title="(.*?)"><a href="(.*?)" data-promid="(.*?)" target="_blank"onclick="(.*?)">(.*?) </a></h3><p class="condition clearfix"title="(.*?)"><span class="text-warning">(.*?)</span><a href="(.*?)"data-selector="data-url" class="area">(.*?)</a><span class="edu">(.*?)</span><span>(.*?)</span></p><p class="time-info clearfix"><time title="(.*?)">(.*?)</time><span title="(.*?)">(.*?)</span></p></div>'
     for fields in re.findall(pattern, body):
         items = LiepinItem()
         # Group indices follow the capture order of ``pattern``.
         items['title'] = fields[0]
         items['info_url'] = fields[1]
         items['workinfo'] = fields[4]
         items['pay'] = fields[6]
         items['worktime'] = fields[10]
         yield Request(url=items['info_url'],
                       callback=self.detail,
                       meta={'item': items})
Example 5
0
    def parse_item(self, response):
        """Parse a job-detail page and yield a LiepinItem.

        Locals are intentionally named after the item's declared fields;
        they are copied into the item in bulk at the end of the method.
        """
        item = LiepinItem()

        city_name = self.city
        # The page uses either class "title-info" or "title-info " (note
        # the trailing space), so try both variants.
        jobname1 = response.xpath('//div[@class="title-info"]/h1/text()')
        if not jobname1:
            jobname = response.xpath(
                '//div[@class="title-info "]/h1/text()')[0].extract()
        else:
            jobname = jobname1[0].extract()
        span = response.xpath(
            '//p[@class="basic-infor"]/span/a/text()').extract()
        if not span:
            position = response.xpath(
                '//p[@class="basic-infor"]/span/text()')[0].extract()
        else:
            position = span[0]
        workingExp = response.xpath(
            '//div[@class="job-qualifications"]/span/text()')[1].extract()
        eduLevel = response.xpath(
            '//div[@class="job-qualifications"]/span/text()')[0].extract()
        salary = response.xpath('//div[@class="job-title-left"]/p/text()'
                                )[0].extract().rstrip('\r\n ')
        company_name = response.xpath(
            '//div[@class="title-info"]/h3/a/text()')[0].extract()
        update_time = response.xpath(
            '//p[@class="basic-infor"]/time/@title')[0].extract()
        job_require = response.xpath(
            '//div[@class="content content-word"]/text()').extract()

        # Stable record id: SHA-1 of company name + update time.
        sha1 = hashlib.sha1()
        string = (company_name + '' + update_time)
        stri = string.encode('utf8')
        sha1.update(stri)
        hash_id = sha1.hexdigest()

        # Copy each declared field from the same-named local variable.
        # (Replaces the original ``eval(field)`` — eval is unsafe and
        # slower; a plain locals() lookup is equivalent here since all
        # field values are method locals.)
        local_vars = locals()
        for field in item.fields.keys():
            item[field] = local_vars[field]
        yield item
Example 6
0
    def parse(self, response):
        """Yield one item per job row of the search-result list.

        BUG FIXES vs the original:
        - XPaths inside the loop were absolute (``//...``) so every row
          selected nodes from the entire document; now relative (``.//``).
        - The item was assembled once *after* the loop, so only the last
          row's data was ever emitted; each row now yields its own item.
        - ``time/@title/text()`` is invalid XPath (attributes have no
          text node); the collected value was unused and is dropped.
        """
        for row in response.xpath('//ul[@class="sojob-list"]/li'):
            item = LiepinItem()
            item["liepin_xueli"] = row.xpath(
                './/div[contains(@class,"job-info")]/p/span[@class="edu"]/text()'
            ).extract()
            item["liepin_jingyan"] = row.xpath(
                './/div[contains(@class,"job-info")]/p/span[@class="edu"]/following-sibling::span/text()'
            ).extract()
            item["job_xinshui"] = row.xpath(
                './/div[contains(@class,"job-info")]/p/span[@class="text-warning"]/text()'
            ).extract()
            item["job_zhicheng"] = [
                x.strip() for x in row.xpath(
                    './/div[contains(@class,"job-info")]/h3/a/text()'
                ).extract()
            ]
            item["job_company_name"] = row.xpath(
                './/div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a/text()'
            ).extract()
            item["job_url"] = row.xpath(
                './/div[contains(@class,"job-info")]/h3/a/@href').extract()
            item["job_company_url"] = row.xpath(
                './/div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a/@href'
            ).extract()
            yield item
Example 7
0
 def parse(self, response):
     """Extract job name/URL/salary/post-time from the listing page and
     follow every pagination link back into this callback.

     FIX: the Python 2 ``print`` statements are converted to the
     function form so the module is valid Python 3.
     """
     sel = Selector(response)
     # Job cards: name, detail URL, salary, publish time.
     jobs = sel.xpath('//div[@class="job-info"]')
     for job in jobs:
         article_name = job.xpath('h3/a/span/text()').extract()
         article_url = job.xpath('h3/a/@href').extract()
         article_xinzi = job.xpath('p[@class="condition clearfix"]/span'
                                   ).xpath('string(.)').extract()
         article_time = job.xpath('p[@class="time-info clearfix"]').xpath(
             'string(.)').extract()
         item = LiepinItem()
         item['article_name'] = article_name
         item['article_url'] = article_url
         item['article_xinzi'] = article_xinzi
         item['article_time'] = article_time
         yield item
     # Follow next-page links (relative hrefs; prefix with the host).
     urls = sel.xpath('//div[@class="pagerbar"]/a/@href').extract()
     for url in urls:
         print(url)
         url = "http://bj.liepin.com" + url
         print(url)
         yield Request(url, callback=self.parse)
Example 8
0
 def detail_parse(self, response):
     """Parse a job-detail page and yield one LiepinItem.

     Handles the three liepin.com layouts, dispatched on the URL:
     '/a/' pages, '/job/' pages, and '/cjob/' pages (the else branch).
     """
     # First element of an extracted list, or 'unknown' when empty.
     panduan = lambda x: x[0] if x else 'unknown'
     job = LiepinItem()
     # '/a/'-style pages.
     if '/a/' in response.url:
         # Job title (class may have a trailing space).
         job['name'] = response.xpath(
             '//div[@class="title-info"]/h1/text() | //div[@class="title-info "]/h1/text()'
         ).extract()[0]
         # Company name.
         job['co_name'] = response.xpath(
             '//div[@class="title-info"]/h3/text() | //div[@class="title-info "]/h3/text()'
         ).extract()[0].strip()
         # Area.
         job['area'] = response.xpath(
             '//div[@class="title"]//p[@class="basic-infor"]/span/text()'
         ).extract()[0]
         # Salary.
         job['salary'] = response.xpath(
             '//div[@class="title"]//p[@class="job-main-title"]/text()'
         ).extract()[0].strip()
         # Experience.
         job['exp'] = response.xpath(
             '//div[@class="resume clearfix"]/span[2]/text()').extract()[0]
         # Education.
         job['edu'] = response.xpath(
             '//div[@class="resume clearfix"]/span[1]/text()').extract()[0]
         # Headcount: not published on this layout.
         job['num'] = 'unknown'
         # Publish time.
         job['time'] = response.xpath(
             '//div[@class="job-title-left"]/p/time/text()').extract(
             )[0].strip()
         # Other requirements.
         otherqlist = response.xpath(
             '//div[@class="resume clearfix"]/span[position()>2]/text()'
         ).extract()
         job['otherq'] = ','.join(otherqlist)
         # Benefits: "<label>:<text>" pairs joined with commas.
         fulis = []
         fuliList = response.xpath(
             '//div[@class="job-main main-message"][3]//ul/li')
         for fuli in fuliList:
             fulis.append(
                 fuli.xpath('./span/text()').extract()[0] + ':' +
                 fuli.xpath('./text()').extract()[0])
         job['welfare'] = ','.join(fulis)
         # Job description.
         infolist = response.xpath(
             '//div[@class="job-main main-message"][1]/div[@class="content content-word"]/text()'
         ).extract()
         job['info'] = ' '.join(infolist)
         # Work address and company URL: unavailable on this layout.
         job['local'] = 'unknown'
         job['co_url'] = 'unknown'
         # Company type.
         job['co_type'] = response.xpath(
             '//div[@class="job-main main-message"][2]//ul/li[5]/text()'
         ).extract()[0]
     # '/job/'-style pages.
     elif '/job/' in response.url:
         # Job title.
         job['name'] = response.xpath(
             '//div[@class="title-info"]/h1/text()').extract()[0]
         # Company name.
         job['co_name'] = response.xpath(
             '//div[@class="title-info"]/h3/a/text()').extract()[0].strip()
         # Area.
         job['area'] = response.xpath(
             '//div[@class="job-item"]//p[@class="basic-infor"]/span/a/text()'
         ).extract()[0]
         # Salary.
         job['salary'] = response.xpath(
             '//div[@class="job-item"]//p[@class="job-item-title"]//text()'
         ).extract()[0].strip()
         # Experience.
         job['exp'] = response.xpath(
             '//div[@class="job-qualifications"]/span[2]/text()').extract(
             )[0]
         # Education.
         job['edu'] = response.xpath(
             '//div[@class="job-qualifications"]/span[1]/text()').extract(
             )[0]
         # Headcount: not published on this layout.
         job['num'] = 'unknown'
         # Publish time.
         job['time'] = response.xpath(
             '//div[@class="job-title-left"]/p/time/text()').extract(
             )[0].strip()
         # Other requirements.
         otherqlist = response.xpath(
             '//div[@class="job-qualifications"]/span[position()>2]/text()'
         ).extract()
         job['otherq'] = ','.join(otherqlist)
         # Benefits.
         welist = response.xpath(
             '//div[@class="tag-list"]/span/text()').extract()
         job['welfare'] = ','.join(welist)
         # Job description.
         infolist = response.xpath(
             '//div[@class="content content-word"]//text()').extract()
         job['info'] = ' '.join(infolist)
         # Work address: text after the colon of the 3rd company line.
         # FIX: the original used ``':'.decode('utf8')`` which is
         # Python 2-only (str has no .decode in Python 3).
         # NOTE(review): some pages may use the full-width colon — confirm.
         job['local'] = response.xpath(
             '//div[@class="company-infor"]//ul[@class="new-compintro"]//li[3]//text()'
         ).extract()[0].split(':').pop()
         # Company URL.
         job['co_url'] = response.xpath(
             '//div[@class="company-infor"]//div[@class="company-logo"]//p/a/@href'
         ).extract()[0]
         # Company type: linked text if present, plain text otherwise.
         if response.xpath(
                 '//ul[@class="new-compintro"]/li[1]/a/text()').extract():
             job['co_type'] = response.xpath(
                 '//ul[@class="new-compintro"]/li[1]/a/text()').extract()[0]
         else:
             job['co_type'] = response.xpath(
                 '//ul[@class="new-compintro"]/li[1]/text()').extract()[0]
     # '/cjob/'-style pages.
     else:
         # Job title.
         job['name'] = response.xpath(
             '//div[@class="job-title"]/h1/text()').extract()[0]
         # Company name.
         job['co_name'] = response.xpath(
             '//div[@class="job-title"]/h2/text()').extract()[0]
         # Area.
         job['area'] = response.xpath(
             '//div[@class="job-main"]/p[@class="job-main-tip"]/span[1]/text()[2]'
         ).extract()[0]
         # Salary.
         job['salary'] = response.xpath(
             '//div[@class="job-main"]/div[@class="job-main-title"]/strong/text()'
         ).extract()[0]
         # Experience.
         job['exp'] = panduan(
             response.xpath(
                 '//div[@class="job-main"]/p[@class="job-qualifications"]/span[2]/text()'
             ).extract())
         # Education.
         job['edu'] = panduan(
             response.xpath(
                 '//div[@class="job-main"]/p[@class="job-qualifications"]/span[1]/text()'
             ).extract())
         # Headcount: not published on this layout.
         job['num'] = 'unknown'
         # Publish time.
         job['time'] = response.xpath(
             '//p[@class="job-main-tip"]/span[2]/text()').extract(
             )[0].strip()
         # Other requirements: not published on this layout.
         job['otherq'] = 'unknown'
         # Benefits.
         # BUG FIX: the original applied ``panduan`` first (yielding a
         # single string) and then ','.join()-ed it, which inserted a
         # comma between every *character*. Join the list itself.
         wellist = response.xpath(
             '//p[@class="job-labels"]/span/text()').extract()
         job['welfare'] = ','.join(wellist) if wellist else 'unknown'
         # Job description.
         job['info'] = response.xpath(
             '//div[@class="job-info"]//div[@class="job-info-content"]/text()'
         ).extract()[0].strip()
         # Work address.
         job['local'] = response.xpath(
             '//div[@class="side-box right-post-map"]/div[@class="side-content"]/p/text()'
         ).extract()[0]
         # Company URL / type: not published on this layout.
         job['co_url'] = 'unknown'
         job['co_type'] = 'unknown'
     yield job
Example 9
0
 def parse(self, response, **kwargs):
     """Collect the @title of every job-info heading into a single item."""
     item = LiepinItem()
     item['title'] = response.xpath(
         "//div[@class='job-info']/h3/@title").extract()
     return item
    def parse(self, response):
        """Handle three page shapes in one callback: a search-result list
        (follow each detail link and paginate), an enterprise job-detail
        page (``#job-view-enterprise``), and a head-hunter job-detail
        page (``#job-hunter``).

        NOTE(review): the many ``extract_first().strip()`` chains raise
        AttributeError when an XPath misses (extract_first returns None)
        — confirm the targeted pages always contain these nodes.
        """
        jobs_xpath = '/html/body[@id="sojob"]//div[@class="sojob-item-main clearfix"]'
        for sel in response.xpath(jobs_xpath):
            job = sel.xpath('div[@class="job-info"]')
            # Sentinel 'Null' marks a row without a detail link.
            job_detail_url = job.xpath('h3/a/@href').extract_first(
                default='Null').strip()
            if job_detail_url != 'Null':
                if job_detail_url.startswith('http'):
                    yield Request(job_detail_url, headers=self.headers)
                elif job_detail_url.startswith('/'):
                    # Relative href: prefix the site host.
                    yield Request('%s%s' %
                                  ('https://www.liepin.com', job_detail_url),
                                  headers=self.headers)
            # Build the next-page address: the anchor following the
            # "current" page marker in the pager bar.
            next_xpath = '//div[@class="pagerbar"]/a[@class="current"]/following-sibling::a[1]'
            # If that anchor's text is not the (disabled) "next page"
            # label, pagination has not reached the end yet.
            if response.xpath(next_xpath +
                              '/text()').extract_first().strip() != u'下一页':
                yield Request('https://www.liepin.com' +
                              response.xpath(next_xpath +
                                             '/@href').extract_first().strip(),
                              headers=self.headers)

        # Enterprise job-detail page.
        one_job = "/html/body/div[@id='job-view-enterprise']"
        if response.xpath(one_job):
            item = LiepinItem()
            # Canonical link identifies the posting.
            list_link = response.xpath(
                "//link[@rel='canonical']/@href").extract()
            item['link'] = "".join(list_link).strip()
            item['job_name'] = response.xpath(
                "/html//div[@id='job-view-enterprise']//div[@class='title-info']/h1/text()"
            ).extract_first().strip()
            item['salary'] = response.xpath(
                "/html//p[@class='job-item-title']/text()").extract_first(
                ).strip()
            item['address'] = response.xpath(
                "/html//div[@class='job-title-left']/p[@class='basic-infor']/span[1]/a/text()"
            ).extract_first().strip()
            item['education'] = response.xpath(
                "/html//div[@class='job-qualifications']/span[1]/text()"
            ).extract_first().strip()
            item['experience'] = response.xpath(
                "/html//div[@class='job-qualifications']/span[2]/text()"
            ).extract_first().strip()
            item['job_require'] = response.xpath(
                "/html//div[@class='about-position']/div[@class='job-item main-message job-description'][1]/div[@class='content content-word']"
            ).extract_first().strip()
            item['company_size'] = response.xpath(
                "//ul[@class='new-compintro']/li[2]/text()").extract_first(
                ).strip()
            item['company_name'] = response.xpath(
                "/html//div[@class='title-info']/h3/a/text()").extract_first(
                ).strip()
            yield item

        # Head-hunter job-detail page.
        one_hunter_job = "/html/body/div[@id='job-hunter']"
        if response.xpath(one_hunter_job):
            item = LiepinItem()
            # Title div class may be 'main ' or 'main' / 'title-info ' —
            # try both page variants before stripping.
            job_name_raw = response.xpath(
                "/html/body/div[@id='job-hunter']/div[@class='wrap clearfix']/div[@class='clearfix content']/div[@class='main ']/div[@class='about-position']/div[@class='title']/div[@class='title-info']/h1/text()"
            ).extract_first()
            if job_name_raw is None:
                job_name_raw = response.xpath(
                    "/html/body/div[@id='job-hunter']/div[@class='wrap clearfix']/div[@class='clearfix content']/div[@class='main']/div[@class='about-position']/div[@class='title']/div[@class='title-info ']/h1/text()"
                ).extract_first()
            item['job_name'] = job_name_raw.strip()
            item['salary'] = response.xpath(
                "/html/body/div[@id='job-hunter']//"
                "p[@class='job-main-title']/text()").extract_first().strip()
            item['education'] = response.xpath(
                "/html/body/div[@id='job-hunter']//div[@class='resume clearfix']/span[1]/text()"
            ).extract_first().strip()
            item['experience'] = response.xpath(
                "/html/body/div[@id='job-hunter']//div[@class='resume clearfix']/span[2]/text()"
            ).extract_first().strip()
            item['job_require'] = response.xpath(
                "/html/body/div[@id='job-hunter']//div[@class='content content-word']"
            ).extract_first().strip()
            yield item
Example 11
0
    def parse(self, response):
        """If the current URL matches a detail-page rule, extract a
        LiepinItem (data-related jobs only); then follow every list and
        detail link on the page back into this callback.

        FIXES vs the original: Python 2 ``print`` statements converted to
        function calls; ``len(x) is 0`` identity comparison replaced with
        an emptiness check; dead commented-out code removed.
        """
        sel = Selector(response)
        current_url = response.url
        # Detail-page analysis.
        for detail_link in CONFIG['detail_link_rule']:

            if is_match(current_url, detail_link):
                print('match detail_link page!', current_url)
                item = LiepinItem()

                div_about_position = sel.xpath(
                    '//div[@class="about-position"]')

                item[u'hire_website'] = 'NULL'
                item[u'company_name'] = div_about_position.xpath(
                    '//h3/a/text()').extract()[0]
                item[u'job_name'] = div_about_position.xpath(
                    '//h1/text()').extract()[0]

                # Keep only jobs whose title contains a data-related term.
                # NOTE(review): this ``return`` aborts the whole callback,
                # so the link-following section below is skipped too —
                # confirm that is intended rather than ``continue``.
                flag = False
                for term in data_related_terms:
                    if term in item[u'job_name']:
                        flag = True
                if flag is False:
                    return

                company_industry = sel.xpath(
                    '//div[@class="company-infor"]/ul/li/a/text()').extract()
                # extract() never returns None; a simple emptiness check
                # replaces the original ``is 0`` identity comparison.
                if not company_industry:
                    item[u'job_category'] = sel.xpath(
                        '//div[@class="company-infor"]/ul/li/text()').extract(
                        )[0].strip()
                else:
                    item[u'job_category'] = company_industry[0].strip()

                # "Other info" block: department, major, ...
                ul_other_info_text = div_about_position.xpath(
                    u'//h3[text()="其他信息:"]/following::div[1]/ul/li/label/text()'
                ).extract()
                item[u'department'] = ul_other_info_text[0]
                item[u'location'] = div_about_position.xpath(
                    '//p[@class="basic-infor"]/span[1]/a/text()').extract(
                    )[-1].strip()
                # Full-time (the site only lists full-time positions here).
                item[u'job_nature'] = u'全职'

                requirement = div_about_position.xpath(
                    '//div[@class="job-qualifications"]/span/text()').extract(
                    )
                item[u'experience'] = requirement[1]
                item[u'education'] = requirement[0]

                item[u'salary'] = div_about_position.xpath(
                    '//p[@class="job-item-title"]/text()').extract()[0].strip(
                    )
                item[u'salary'] = process_salary(item[u'salary'])

                item[u'major'] = ul_other_info_text[1]

                item[u'hire_num'] = 'NULL'

                # Job description paragraphs.
                description = div_about_position.xpath(
                    u'//h3[text()="职位描述:"]/following::div[1]/text()').extract(
                    )

                item[u'temptation'] = div_about_position.xpath(
                    u'//div[@class="tag-list"]/span[@class="tag"]/text()'
                ).extract()
                item[u'temptation'] = ','.join(item[u'temptation'])

                item[u'description'] = "\n".join(description)

                ul_company_detail_text = sel.xpath(
                    '//div[@class="company-infor"]/ul/li/text()').extract()
                item[u'industry'] = sel.xpath(
                    '//div[@class="company-infor"]/ul/li[1]/a/text()').extract(
                    )[0]
                item[u'company_nature'] = ul_company_detail_text[-1].strip()
                item[u'finance'] = 'NULL'
                # Heuristic scan of company-detail lines: staff count
                # contains "人" (people), funding round contains "轮".
                for li_text in ul_company_detail_text:
                    if u'人' in li_text:
                        item[u'staff_num'] = li_text.strip()
                    elif u'轮' in li_text:
                        item[u'finance'] = li_text.strip()

                item[u'company_website'] = 'NULL'
                item[u'publish_date'] = div_about_position.xpath(
                    '//p[@class="basic-infor"]/span[2]/text()').extract(
                    )[-1].strip()
                item[u'publish_date'] = get_publish_time(item[u'publish_date'])
                item[u'publish_website'] = 'NULL'
                item[u'original_url'] = current_url

                print(item.__dict__)
                yield item

        # Follow every list and detail link found on this page.
        for url in sel.xpath('//a/@href').extract():
            url = urljoin(current_url, url)
            for list_link in CONFIG['list_link_rule']:
                if is_match(url, list_link):
                    yield Request(url, callback=self.parse)

            for detail_link in CONFIG['detail_link_rule']:
                if is_match(url, detail_link):
                    print('match detail_link page!', url)
                    yield Request(url, callback=self.parse)