Ejemplo n.º 1
0
    def jobdetail(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        Most fields come from the listing record stored under
        ``response.meta['post']`` (keys read here: ``jobName``, ``jobHref``,
        ``pubCity``, ``salary``, ``compName`` and, optionally, ``pubEx``,
        ``pubEdu`` and ``industryName``).  Address, headcount and benefits
        are scraped from the detail page itself.  Fields with no source are
        filled with the string ``'null'``.

        Yields:
            A single populated ``JobsearchItem``.

        Raises:
            KeyError: if one of the mandatory keys is missing from the
                ``post`` record.
        """
        post = response.meta['post']

        item = JobsearchItem()
        item['postn_name'] = str(post['jobName'])
        # jobHref is concatenated onto a bare scheme, so it is presumably
        # protocol-relative ("//host/path") -- TODO confirm against the API.
        item['postn_web'] = str('https:' + post['jobHref'])
        item['postn_area'] = str(post['pubCity'])
        item['postn_adds'] = str(
            response.xpath("//div [@class='ads-msg']/span/text()").extract())
        item['postn_salary'] = str(post['salary'])
        # Optional listing keys fall back to the 'null' placeholder.
        item['postn_experience'] = str(post.get('pubEx', 'null'))
        item['postn_edu'] = str(post.get('pubEdu', 'null'))

        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        # The regex engine itself understands "\xa0" (non-breaking space).
        item['postn_numHire'] = str(
            response.xpath("//li [@class='recruiting']/span/text()").re(
                r"(\d+)\xa0人"))
        item['postn_benifit'] = str(
            response.xpath(
                "//div [@class='job-msg-bottom']/ul/li/text()").extract())
        item['com_name'] = str(post['compName'])
        item['com_web'] = 'null'
        item['com_simInfo'] = str(post.get('industryName', 'null'))

        item['post_date'] = 'null'
        item['resource'] = self.name
        yield item
Ejemplo n.º 2
0
 def jobdetail(self, response):
     """Build one ``JobsearchItem`` from a job page, or retry on a redirect.

     On HTTP 200 the item is filled mostly from the listing record in
     ``response.meta['post']``; only the company website is scraped from
     the page.  On HTTP 302 the original request is re-issued with
     ``dont_filter=True`` so the duplicate filter does not drop it.  Any
     other status yields nothing.

     Yields:
         A populated ``JobsearchItem`` (status 200) or a retry ``Request``
         (status 302).

     Raises:
         KeyError: if the ``post`` record lacks one of the keys read below.
     """
     if response.status == 200:
         post = response.meta['post']

         item = JobsearchItem()
         item['postn_name'] = str(post['positionName'])
         item['postn_web'] = str('https://www.lagou.com/jobs/' + str(post['positionId']) + '.html')
         item['postn_area'] = str(post['city'])
         # NOTE(review): the previous version extracted the
         # "//div [@class='work_addr']/a/text()" parts here, dropped one
         # entry and then discarded the result.  That dead code raised
         # IndexError on pages without the address block, so it was removed.
         # If a street address should be stored, assign it to an item field.
         item['postn_salary'] = str(post['salary'])
         item['postn_experience'] = str(post['workYear'])
         item['postn_edu'] = str(post['education'])
         item['postn_numHire'] = 'Null'
         # Benefits = company labels followed by the position's advantage.
         benefits = list(post['companyLabelList'])
         benefits.append(post['positionAdvantage'])
         item['postn_benifit'] = str(benefits)
         item['com_name'] = str(post['companyFullName'])
         item['com_web'] = str(response.xpath("//i [@class='icon-glyph-home']/following-sibling::a/@href").extract())
         item['com_simInfo'] = str(post['industryField'])
         item['post_date'] = str(post['formatCreateTime'])
         item['resource'] = self.name
         yield item
     elif response.status == 302:
         # Re-schedule the same request, bypassing the duplicate filter.
         yield response.request.replace(dont_filter=True)
Ejemplo n.º 3
0
    def parse_job(self, response):
        """Yield one ``JobsearchItem`` per listing row on a results page.

        Rows are addressed positionally as
        ``/html/body/div[2]/div[4]/div[i]`` for ``i`` in 5..57.  Row indexes
        that do not exist on the page are skipped.

        The previous version returned from inside the loop, so only row 5
        was ever emitted.  It also reused one ``ItemLoader`` for every row,
        which would have merged all rows into a single item.

        Yields:
            A ``JobsearchItem`` loaded with ``company_name``, ``job_name``,
            ``working_city``, ``salary`` and ``release_time`` for each row.
        """
        row_template = "/html/body/div[2]/div[4]/div[%i]"
        # (item field, path relative to the row element)
        columns = (
            ("company_name", "/span[1]/a/text()"),
            ("job_name", "/p/span/a/text()"),
            ("working_city", "/span[2]/text()"),
            ("salary", "/span[3]/text()"),
            ("release_time", "/span[4]/text()"),
        )

        for i in range(5, 58):
            row = row_template % i
            if not response.xpath(row):
                # normalize-space() of a missing node is '', which would
                # otherwise produce an item made of empty strings.
                continue

            # Fresh loader per row so values do not accumulate across rows.
            item_loader = ItemLoader(item=JobsearchItem(), response=response)
            for field, relative in columns:
                item_loader.add_xpath(
                    field, "normalize-space(%s%s)" % (row, relative))
            yield item_loader.load_item()
Ejemplo n.º 4
0
    def parse_infos(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        Single-valued fields use ``extract_first()`` and are therefore
        ``None`` when the node is absent.  ``gs_msg``, ``job_info`` and
        ``address`` have all whitespace removed and are empty strings when
        nothing matches.

        Yields:
            A single populated ``JobsearchItem``.
        """
        item = JobsearchItem()
        item['job_link'] = response.url
        item['job_name'] = response.xpath(
            '//div[@class="cn"]/h1/@title').extract_first()
        item['job_city'] = response.xpath(
            '//div[@class="cn"]/span[@class="lname"]/text()').extract_first()
        item['salary'] = response.xpath(
            '//div[@class="cn"]/strong/text()').extract_first()
        item['gs_name'] = response.xpath(
            '//div[@class="cn"]/p[@class="cname"]/a/@title').extract_first()
        item['gs_link'] = response.xpath(
            '//div[@class="cn"]/a/@href').extract_first()
        # default='' keeps a missing summary line from raising
        # AttributeError.  The regex removes all whitespace, including the
        # leading and trailing whitespace that .strip() used to remove.
        msg = response.xpath(
            '//p[contains(@class,"msg") and contains(@class,"ltype")]/text()'
        ).extract_first(default='')
        item['gs_msg'] = re.sub(r"\s", "", msg)
        item['gs_fl'] = response.xpath(
            '//p[@class="t2"]/span/text()').extract()
        # Education requirement; may be absent.  The marker <em> child has
        # to be located first, then its parent's text is read.
        item['req_xl'] = response.xpath(
            '//em[@class="i2"]/../text()').extract_first()
        # Experience requirement; may be absent.
        item['req_jy'] = response.xpath(
            '//em[@class="i1"]/../text()').extract_first()
        item['create_date'] = response.xpath(
            '//em[@class="i4"]/../text()').extract_first()
        infos = response.xpath(
            '//div[contains(@class,"job_msg")]/text()').extract()
        item['job_info'] = re.sub(r"\s", "", "".join(infos))
        address = response.xpath('//p[@class="fp"]/text()').extract()
        item['address'] = re.sub(r"\s", "", "".join(address))

        yield item
Ejemplo n.º 5
0
    def jobdetail(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        The text fragments of ``<p class="msg ltype">`` supply the area
        (first fragment) and, via ``postInfo``, the posting date,
        experience, headcount and education.  Fields not found keep the
        ``'null'`` placeholder.

        Yields:
            A single populated ``JobsearchItem``.

        Raises:
            IndexError: if the header line, the third text node of the
                ``bmsg inbox`` block or the second ``<strong>`` text is
                missing from the page.
        """

        def postInfo(info):
            """Classify header fragments by the keyword each one contains."""
            post = {
                'experience': 'null',
                'edu': 'null',
                'numhire': 'null',
                'postdate': 'null'
            }
            for i in info:
                text = i.strip()
                if '发布' in i:
                    post['postdate'] = text
                elif '经验' in i:
                    post['experience'] = text
                elif '招' in i:
                    post['numhire'] = text
                else:
                    # Anything unrecognised is treated as the education
                    # requirement; the last such fragment wins.
                    post['edu'] = text
            return post

        # Evaluate the header query once; it feeds both the area and the
        # classified fragments.
        header = response.xpath("//p [@class='msg ltype']/text()").extract()
        postInfom = postInfo(header[1:])

        item = JobsearchItem()
        item['postn_name'] = str(response.xpath("//h1/@title").extract())
        item['postn_web'] = str(response.url)
        item['postn_area'] = str(header[0].strip())
        item['com_name'] = str(
            response.xpath("//p [@class='cname']/a/@title").extract())
        item['com_web'] = str(
            response.xpath("//p [@class='cname']/a/@href").extract())
        item['com_simInfo'] = str(
            response.xpath("//div [@class='com_tag']/p/@title").extract())
        item['post_date'] = str(postInfom['postdate'])
        item['postn_adds'] = str(
            response.xpath("//div [@class='bmsg inbox']/p//text()").extract()
            [2].strip())
        item['postn_salary'] = str(
            response.xpath("//strong/text()").extract()[1])
        item['postn_experience'] = str(postInfom['experience'])
        item['postn_edu'] = str(postInfom['edu'])
        item['postn_numHire'] = str(postInfom['numhire'])
        item['postn_benifit'] = str(
            response.xpath("//div [@class='t1']/span/text()").extract())
        item['resource'] = self.name
        yield item

    # The two methods below used to be indented inside ``jobdetail`` after
    # its ``yield``.  That made them unused local functions, so the
    # spider_closed handler was never connected.

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect ``spider_closed`` to its signal."""
        # Imported locally because this code was unreachable before, so a
        # module-level import of ``signals`` cannot be assumed.
        from scrapy import signals

        spider = super(Job51Spider,
                       cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Log the name of the spider that has just closed."""
        spider.logger.info('Spider closed: %s', spider.name)
Ejemplo n.º 6
0
    def parse(self, response):
        """Yield a titled item per ``.e.eck`` element, then follow pagination.

        Yields:
            One ``JobsearchItem`` for each element matching ``.e.eck``, with
            ``title`` set to the first ``span`` text inside it (``None``
            when there is none), followed by one request per ``a.next``
            link, handled by this same callback.
        """
        for h in response.css('.e.eck'):
            # A new item per row: re-yielding one shared instance would let
            # later rows overwrite the title of items already handed on.
            item = JobsearchItem()
            item['title'] = h.css('span::text').extract_first()
            yield item

        for href in response.css('a.next').xpath('./@href'):
            yield response.follow(href, callback=self.parse)
Ejemplo n.º 7
0
    def jobDetail(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        City, experience and education are parsed from the ``key:value``
        text fragments of the primary info block.  Company details are
        split into a website (a fragment that looks like a URL) and the
        remaining descriptive fragments.  Fields not found keep the
        ``'null'`` placeholder.

        Yields:
            A single populated ``JobsearchItem``.
        """

        def getIt(info):
            """Return the part of *info* after its first ':'.

            The whole string is returned when there is no ':'.
            """
            return info[info.find(':') + 1:]

        def postInfo(info):
            """Classify fragments into city / experience / education."""
            post = {'city': 'null', 'experience': 'null', 'edu': 'null'}
            for i in info:
                if '城市' in i:
                    post['city'] = getIt(i.strip())
                elif '经验' in i:
                    post['experience'] = getIt(i.strip())
                else:
                    # Anything unrecognised is taken as education; the last
                    # such fragment wins.
                    post['edu'] = getIt(i.strip())
            return post

        def comInfor(*info):
            """Split company fragments into a website and other details."""
            info = list(chain.from_iterable(info))
            company = {'info': 'null', 'web': 'null'}
            tem = []
            url_markers = ('http', 'www', 'com')
            for i in info:
                # The old test, ('http' or 'www' or 'com') in i, reduced to
                # 'http' in i, so 'www' and 'com' were never checked.
                if any(marker in i for marker in url_markers):
                    company['web'] = i
                else:
                    tem.append(i)
            company['info'] = tem
            return company

        item = JobsearchItem()

        post = postInfo(
            response.xpath("//div [@class='job-primary detail-box']/div [@class='info-primary']/p/text()").extract())
        company = comInfor(
            response.xpath("//div [@class='info-company']/p/text()").extract(),
            response.xpath("//div [@class='info-company']/p/a/text()").extract()
        )

        item['postn_name'] = str(response.xpath(
            "//div [@class='job-primary detail-box']/div [@class='info-primary']/div [@class='name']/h1/text()").extract())
        item['postn_web'] = str(response.request.url)
        item['postn_area'] = str(post['city'])
        item['postn_adds'] = str(response.xpath("//div [@class='location-address']/text()").extract())
        item['postn_salary'] = str(response.xpath(
            "normalize-space(//div [@class='job-primary detail-box']/div [@class='info-primary']/div [@class='name']/span/text())").extract())
        item['postn_experience'] = str(post['experience'])
        item['postn_edu'] = str(post['edu'])
        item['postn_numHire'] = 'null'
        item['postn_benifit'] = str(response.xpath(
            "//div [@class='detail-content']/div [@class='job-sec']/div [@class='job-tags']/span/text()").extract())
        item['com_name'] = str(response.xpath("//div [@class='info-company']/h3/a/text()").extract())
        item['com_web'] = str(company['web'])
        item['com_simInfo'] = str(company['info'])
        item['post_date'] = str(response.xpath("//div [@class='job-author']/span/text()").re(r"发布于(.*)"))
        item['resource'] = self.name
        yield item
Ejemplo n.º 8
0
    def jobdetail(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        Two page layouts are handled.  URLs containing
        ``https://www.liepin.com/job/`` use one set of XPath queries and
        every other URL uses an alternative set; the remaining fields are
        extracted identically for both.

        Yields:
            A single populated ``JobsearchItem``.

        Raises:
            IndexError: if the qualifications block has fewer than two
                ``span`` texts, or an industry line has no link text.
        """

        def text_list(query):
            """Return ``str()`` of the list of all matches for *query*."""
            return str(response.xpath(query).extract())

        def split_company_lines(lines):
            """Separate the company address from the other intro lines."""
            summary = {'info': 'null', 'addrs': 'null'}
            details = []
            for raw in lines:
                text = raw.strip()
                if '公司地址' in raw:
                    summary['addrs'] = text
                elif '行业' in raw:
                    # The industry name sits in a link inside the list item.
                    industry = response.xpath(
                        "//ul [@class='new-compintro']/li/a/text()").extract()[0]
                    details.append('行业:' + industry)
                elif text:
                    details.append(text)
            summary['info'] = details
            return summary

        item = JobsearchItem()
        companyinfo = split_company_lines(
            response.xpath("//ul [@class='new-compintro']/li/text()").extract())

        # Pick the layout-specific queries first; everything else is shared.
        if 'https://www.liepin.com/job/' in response.request.url:
            name_query = "//div [@class='title-info']/h1/@title"
            area_query = "//p [@class='basic-infor']/span/a/text()"
            salary_query = "normalize-space(//p [@class='job-item-title']/text())"
            quals_query = "//div [@class='job-qualifications']/span/text()"
            company_query = "//div [@class='title-info']/h3/a/@title"
        else:
            name_query = "//div [@class='title-info ']/h1/@title"
            area_query = "normalize-space(//p [@class='basic-infor']/span/text())"
            salary_query = "normalize-space(//p [@class='job-main-title']/text())"
            quals_query = "//div [@class='resume clearfix']/span/text()"
            company_query = "//p [@class='company-name']/@title"

        item['postn_name'] = text_list(name_query)
        item['postn_web'] = str(response.request.url)
        item['postn_area'] = text_list(area_query)
        item['postn_adds'] = str(companyinfo['addrs'])
        item['postn_salary'] = text_list(salary_query)
        # Second span is read as experience, first as education.
        qualifications = response.xpath(quals_query).extract()
        item['postn_experience'] = str(qualifications[1])
        item['postn_edu'] = str(qualifications[0])
        item['postn_numHire'] = 'null'
        item['postn_benifit'] = text_list("//div [@class='tag-list']/span/text()")
        item['com_name'] = text_list(company_query)
        item['com_web'] = text_list("//div [@class='title-info']/h3/a/@href")
        item['com_simInfo'] = str(companyinfo['info'])
        item['post_date'] = text_list(
            "normalize-space(//p [@class='basic-infor']/time/text())")
        item['resource'] = self.name
        yield item
Ejemplo n.º 9
0
    def parseDetail(self, response):
        """Build one ``JobsearchItem`` with detail, salary, experience and area.

        Every field is the XPath ``string()`` value of one node; ``detail``
        is additionally stripped of surrounding whitespace.

        Yields:
            A single populated ``JobsearchItem``.
        """

        def first_string(query):
            """Return the first result of *query* on this response."""
            return response.xpath(query).extract_first()

        item = JobsearchItem()
        # 'string(//article)' gives the same text as selecting //article
        # and then applying 'string(.)' to it.
        item['detail'] = first_string('string(//article)').strip()
        item['salary'] = first_string('string(//p[@class="jp"])')
        item['experience'] = first_string('string(//span[@class="s_n"])')
        item['area'] = first_string('string(//div[@class="jt"]/em)')

        yield item
Ejemplo n.º 10
0
    def jobdetail(self, response):
        """Build one ``JobsearchItem`` from a job-detail page.

        Scraped fields are stored as the ``str()`` of the list of matched
        text nodes, except education, which is the fourth ``span`` text of
        the ``job_require`` block.  Address and headcount are not scraped
        and are set to ``'null'``.

        Yields:
            A single populated ``JobsearchItem``.

        Raises:
            IndexError: if the ``job_require`` block has fewer than four
                ``span`` texts.
        """

        def matches(query):
            """Return the list of all extracted results for *query*."""
            return response.xpath(query).extract()

        company_heading = "//div [@class='company_intro  jpadding mt15']/h4/a"
        requirement_spans = matches("//div [@class='job_require']/span/text()")

        item = JobsearchItem()
        item['postn_name'] = str(
            matches("//div [@class='base_info']/div/h1/span/text()"))
        item['postn_web'] = str(response.request.url)
        item['postn_area'] = str(
            matches("//div [@class='job_require']/span [@class='job_loc']/text()"))
        item['postn_adds'] = 'null'
        item['postn_salary'] = str(matches("//span [@class='job_price']/text()"))
        item['postn_experience'] = str(matches("//span [@class='job_exp']/text()"))
        item['postn_edu'] = str(requirement_spans[3])
        item['postn_numHire'] = 'null'
        item['postn_benifit'] = str(
            matches("//div [@class='job_fit_tags']/ul/li/text()"))
        item['com_name'] = str(matches(company_heading + "/text()"))
        item['com_web'] = str(matches(company_heading + "/@href"))
        item['com_simInfo'] = str(
            matches("//div [@class='compny_tag']/span/text()"))
        item['post_date'] = str(matches("//p [@class='updatetime']/text()"))
        item['resource'] = self.name
        yield item