Python BossItem Beispiele, boss.items.BossItem Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: bossspider.py Projekt: ZiMingJ/Python_Crawler

    def parse(self, response):
        """Parse one job-list page and follow pagination.

        Yields one ``BossItem`` per ``<li>`` of the job list, carrying the job
        title texts, the midpoint of the advertised salary range (as a
        one-element list holding a string) and the company link texts. It then
        increments ``self.page`` and, while that page number is at most 2 and
        the current page links to it, yields a request for the next page.

        Raises:
            IndexError: if a row has no ``span.red`` salary node.
            ValueError: if the salary text is not of the form ``<int>-<int>K``
                after truncation to six characters.
        """
        node_list = response.xpath('//div[@class="job-list"]/ul/li')
        for node in node_list:
            # A fresh item per row: re-yielding one shared instance would hand
            # the same mutable object to the pipeline for every row.
            item = BossItem()
            item['job'] = node.xpath(
                './/div[@class="info-primary"]//div[@class="job-title"]/text()'
            ).extract()
            # Only the first six characters of the salary text are used.
            salary = node.xpath('.//span[@class="red"]/text()')[0].extract()[:6]

            low, high = re.split("-", salary)
            midpoint = (int(low) + int(high.replace("K", ""))) / 2
            item['salary'] = [str(midpoint)]
            item['company'] = node.xpath(
                './/div[@class="job-primary"]/div[@class="info-company"]//a/text()'
            ).extract()
            print(item['job'], item['salary'], item['company'])
            yield item  # hand the item over to the item pipeline

        # Crawl the next page.
        self.page += 1
        next_link = response.xpath(
            '//div[@class="page"]/a[@href="/c100010000-p100109/?page={0}"]'
            .format(self.page)).extract_first()
        if next_link is not None and self.page <= 2:
            nextpage = 'https://www.zhipin.com/c100010000-p100109/?page={0}'.format(
                self.page)
            # The bound method is passed as the callback, not called here.
            yield scrapy.Request(nextpage, callback=self.parse)

Beispiel #2

0

Datei anzeigen

 def parse(self, response):
     """Yield a ``BossItem`` per job-list row, then request the next page.

     The next-page request is only issued when the page contains a
     ``page-next`` link with an ``href``; its URL is built from ``self.url``
     and the incremented ``self.pageNum``.
     """
     # Used as the stop condition for the recursive crawl.
     next_href = response.xpath(
         '//div[@class="page"]/a[@ka="page-next"]/@href').extract_first()

     for row in response.xpath('//div[@class="job-list"]/ul/li'):
         fields = {
             'job_title': row.xpath(
                 './div/div[1]/h3/a/div[1]/text()')[0].extract(),
             'salary': row.xpath('./div/div[1]/h3/a/span/text()')[0].extract(),
             'url': 'https://www.zhipin.com' + row.xpath(
                 './div/div[1]/h3/a/@href').extract_first(),
             'company': row.xpath(
                 './div/div[2]/div[1]/h3/a/text()').extract_first(),
             'release_time': row.xpath(
                 './div/div[3]/p/text()').extract_first(),
         }
         item = BossItem()
         for key, value in fields.items():
             item[key] = value
         yield item

     # Stop once the "next" anchor no longer carries an href.
     if not next_href:
         return
     self.pageNum += 1  # the first follow-up request is for page two
     # Re-enter this method for the next page.
     yield scrapy.Request(url=self.url + str(self.pageNum),
                          callback=self.parse)

Beispiel #3

0

Datei anzeigen

Datei: boss_spider.py Projekt: 476310982/Hello_python

 def parse_job(self, response):
     """Parse a job detail page into a ``BossItem``.

     Extracts name, salary, the first three text nodes of the primary info
     paragraph (used as city, work years and education) and the company
     title attribute, prints a short summary and yields the item.

     Raises:
         IndexError: if the info paragraph has fewer than three text nodes.
     """
     print(response.text)
     # ``extract()`` returns a list, which has no ``strip()``; the first match
     # is taken instead, with an empty-string default so a missing node does
     # not raise AttributeError.
     name = response.xpath(
         '//div[@class="info-primary"]/div/h1/text()'
     ).extract_first(default='').strip()
     salary = response.xpath(
         '//div[@class="info-primary"]/div/span/text()'
     ).extract_first(default='').strip()
     infos = response.xpath(
         '//div[contains(@class,"job-primary")]/div[@class="info-primary"]/p/text()'
     ).extract()
     city = infos[0]
     work_years = infos[1]
     education = infos[2]
     company = response.xpath(
         '//div[@class="sider-company"]//div[@class="company-info"]/a[1]/@title'
     ).extract_first(default='').strip()
     item = BossItem(name=name,
                     salary=salary,
                     city=city,
                     work_years=work_years,
                     education=education,
                     company=company)
     print('=' * 60)
     print(name, salary, company)
     print('=' * 60)
     yield item

Beispiel #4

0

Datei anzeigen

    def parse_job(self, response):
        """Extract one job posting with XPath and yield it as a ``BossItem``.

        Every field falls back to the placeholder ``'无'`` when the page does
        not provide it. This includes the case where the info paragraph
        yields only one or two text nodes, which previously raised IndexError.
        """
        placeholder = '无'

        name = response.xpath("//div[@class='name']/h1/text()").get()
        name = name.strip() if name else placeholder

        salary = response.xpath("//span[@class='salary']/text()").get()
        salary = salary.strip() if salary else placeholder

        job_info = response.xpath(
            '//div[contains(@class, "job-primary ")]/div[@class="info-primary"]/p//text()'
        ).getall()
        # Pad to three entries so a short list fills the missing fields with
        # the placeholder instead of failing on job_info[1] / job_info[2].
        city, work_years, education = (job_info + [placeholder] * 3)[:3]

        company = response.xpath(
            "//div[@class='info-company']//a/text()").get()
        company = company.strip() if company else placeholder

        yield BossItem(name=name,
                       salary=salary,
                       city=city,
                       work_years=work_years,
                       education=education,
                       company=company)

Beispiel #5

0

Datei anzeigen

 def parse_job(self, response):
     """Load a job detail page into a ``BossItem`` through ``BossItemLoader``.

     Values are read from the page with a PyQuery document. If any step
     raises, the page body, the traceback and the error text are dumped to a
     file under ``log_path`` and whatever was loaded so far is still returned.

     Returns:
         The item produced by ``item_loader.load_item()``.
     """
     doc = pq(response.text)
     item_loader = BossItemLoader(item=BossItem(), response=response)
     try:
         # NOTE(review): re.findall needs a string; if self.start_urls is the
         # usual list of URLs this raises TypeError, the handler below runs
         # and no field after this line is loaded - confirm against the spider.
         item_loader.add_value('job_tag', re.findall('query=(.*?)&', self.start_urls))
         item_loader.add_value('url_id', get_md5(response.url))
         item_loader.add_value('city', doc('.text-city').text())
         item_loader.add_value('job_title', doc('title').text())
         item_loader.add_value('job_describe', transfer_json(doc('.job-sec .text').text()))
         item_loader.add_value('job_address', doc('.location-address').text())
         item_loader.add_value('job_url', response.url)
         item_loader.add_value('job_createtime', doc('.sider-company .gray').text().split('：')[-1])
         salary_text = doc('.salary').text()
         item_loader.add_value('salary', str(salary_text.strip('K').split('-')))
         if '·' in salary_text:
             item_loader.add_value('salary_multiple', salary_text.strip('薪').split('·')[-1])
         item_loader.add_value('company', doc('.job-sec .name').text())
         item_loader.add_value('company_createtime', doc('.level-list .res-time').text().split('：')[-1])
         # Match once and reuse the result instead of running the regex twice.
         fund_match = re.match('.*注册资金：(.*)万', doc('.level-list').text(), re.S)
         item_loader.add_value('company_registered_fund',
                               fund_match.group(1) if fund_match else None)
         # Raw string: '\d' in a plain literal is an invalid escape sequence.
         item_loader.add_value('company_people', re.findall(r'\d.*人', doc('.sider-company p').text()))
         item_loader.add_value('company_industry', doc('.sider-company a[ka=job-detail-brandindustry]').text())
         item_loader.add_value('company_describe', doc('.job-sec.company-info .text').text())
         item_loader.add_value('create_time', datetime.now().strftime('%Y-%m-%d %X'))
     except Exception as e:
         # Best-effort diagnostics: keep the page and the failure for later.
         # NOTE(review): the file name format '%Y-%m%d %X' has no hyphen
         # between month and day and %X normally contains ':' - confirm this
         # is intended and valid on the target file system.
         with open(log_path + str(datetime.now().strftime('%Y-%m%d %X') + '.html'), 'w', encoding='utf-8') as f:
             f.write(response.text)
             f.write('\r\n')
             f.write(traceback.format_exc())
             f.write('\r\n')
             f.write(str(e))
     finally:
         # NOTE(review): returning from ``finally`` also discards any
         # exception raised inside the handler above (e.g. a failed open());
         # kept as is to preserve the always-return behaviour.
         job_item = item_loader.load_item()
         return job_item

Beispiel #6

0

Datei anzeigen

Datei: zhipin.py Projekt: guishen2017/boss

 def parse_job(self, response):
     """Collect job and company details from a detail page into a ``BossItem``.

     The three primary-info lines are split on the full-width colon and the
     part after it is kept as city, work experience and education. Multi-node
     texts are joined with commas.
     """
     position_name = response.xpath('//div[@class="name"]/h1/text()').get()
     salary = response.xpath(
         '//div[@class="info-primary"]//span[@class="badge"]/text()').get()

     primary_lines = response.xpath(
         '//div[@class="job-primary detail-box"]//div[@class="info-primary"]/p/text()'
     ).getall()
     # The unpacking requires exactly three lines, each containing "：".
     city, work_experience, education = [
         line.split("：")[1] for line in primary_lines
     ]

     tag_texts = response.xpath('//div[@class="job-tags"]/span/text()').getall()
     tags = ",".join(tag_texts)

     describe_parts = response.xpath(
         '//div[@class="job-sec"]/div[@class="text"]//text()').getall()
     describe = ",".join(describe_parts).replace("\n", "").strip()

     company_parts = response.xpath(
         '//div[@class="job-sec company-info"]/div[@class="text"]/text()'
     ).getall()
     company_describe = ",".join(company_parts).replace("\n", "").strip()

     level_texts = response.xpath(
         '//div[@class="job-sec"]//div[@class="level-list"]//text()').getall()
     # Drop whitespace and both ASCII and full-width commas from the joined text.
     information = re.sub(r"[\n\s,，]", "", ",".join(level_texts))

     work_location = response.xpath(
         '//div[@class="location-address"]/text()').get()
     company_name = response.xpath('//h3[@class="name"]/a/text()').get()
     company_url = response.xpath(
         '//div[@class="info-company"]/p[last()]/text()').get()

     yield BossItem(
         position_name=position_name,
         salary=salary,
         city=city,
         work_experience=work_experience,
         education=education,
         tags=tags,
         describes=describe,
         company_describe=company_describe,
         information=information,
         work_location=work_location,
         company_name=company_name,
         company_url=company_url,
         url=response.url,
         url_object_id=get_md5(response.url),
     )

Beispiel #7

0

Datei anzeigen

 def parse_boss(self, response):
     """Parse a job detail page, print the resulting ``BossItem`` and yield it.

     The first three text nodes of the banner's primary-info paragraph are
     used as working address, working age and education.

     Raises:
         AttributeError: if the job name, company or salary node is missing.
         IndexError: if the primary-info paragraph has fewer than three texts.
     """
     job_name = response.xpath(
         "//div[@class='job-banner']//div[@class='name']/h1/text()").get(
         ).strip()
     company = response.xpath(
         "//div[@class='detail-content']/div[4]//div[@class='name']/text()"
     ).get().strip()
     salary = response.xpath(
         "//div[@class='job-banner']//div[@class='name']/span/text()").get(
         ).strip()
     # Evaluate the shared XPath once instead of once per field.
     primary_texts = response.xpath(
         "//div[@class='job-banner']//div[@class='info-primary']/p//text()"
     ).getall()
     working_address = primary_texts[0]
     working_age = primary_texts[1]
     education = primary_texts[2]
     detail = response.xpath(
         "//div[@class='job-box']//div[@class='job-detail']//div[@class='text']/text()"
     ).get()
     item = BossItem(job_name=job_name,
                     company=company,
                     salary=salary,
                     working_address=working_address,
                     working_age=working_age,
                     education=education,
                     detail=detail)
     print(item)
     yield item

Beispiel #8

0

Datei anzeigen

 def parse_job(self, response):
     """Parse a job detail page and yield a ``BossItem``.

     City, work year and education are the first three text nodes of the
     primary-info paragraph.

     Raises:
         AttributeError: if the title or salary node is missing.
         IndexError: if the info paragraph has fewer than three text nodes.
     """
     title = response.xpath("//h1[@class= 'name']/text()").get().strip()
     salary = response.xpath("//h1[@class = 'name']/span/text()").get().strip()
     # getall() is required here: get().strip() returned a single string, so
     # indexing it produced single characters rather than the three fields.
     # NOTE(review): the container class was spelled 'job_primary'; it is
     # changed to 'job-primary' to match the hyphenated 'info-primary' next to
     # it - confirm against the live markup.
     job_info = response.xpath(
         "//div[@class= 'job-primary']/div[@class= 'info-primary']/p//text()"
     ).getall()
     city = job_info[0]
     work_year = job_info[1]
     education = job_info[2]
     item = BossItem(title=title, salary=salary, city=city, work_year=work_year, education=education)
     yield item

Beispiel #9

0

Datei anzeigen

Datei: zhipin.py Projekt: ZHAISHENKING/scrapy-redis

 def parse(self, response):
     """Yield a ``BossItem`` (link and name) for at most the first ten posts."""
     first_ten = response.css('div.post-meta')[:10]
     for entry in first_ten:
         href = entry.css('a.archive-title::attr(href)').extract_first()
         caption = entry.css('a.archive-title::text').extract_first()

         item = BossItem()
         item['link'] = href
         item['name'] = caption
         print(item)
         yield item

Beispiel #10

0

Datei anzeigen

    def parse(self, response):
        """Parse a job-list page once and schedule one not-yet-visited page.

        Pages whose URL is already in ``Demo.url_over_set`` are skipped.
        Otherwise one ``BossItem`` is yielded per list row, the pagination
        links are added to ``Demo.url_set``, and one URL popped from that set
        is requested with this method as callback.

        Raises:
            IndexError: if a row lacks one of the expected nodes.
        """
        if response.url in Demo.url_over_set:
            return
        Demo.url_over_set.add(response.url)

        hxs = HtmlXPathSelector(response)

        # Extract the data rows.
        for row in hxs.xpath('//div[@class="job-list"]/ul/li'):
            boss_item = BossItem()
            boss_item['jobName'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/h3/a/text()'
            ).extract()[0]
            boss_item['salary'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/h3/a/span/text()'
            ).extract()[0]
            boss_item['companyName'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-company"]//h3/a/text()'
            ).extract()[0]
            # City, experience and education share one paragraph; query it once.
            primary_texts = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/p/text()'
            ).extract()
            boss_item['city'] = primary_texts[0]
            boss_item['life'] = primary_texts[1]
            boss_item['education'] = primary_texts[2]
            boss_item['skill'] = row.xpath(
                './div[@class="job-tags"]/span/text()').extract()
            boss_item['time'] = row.xpath(
                './div[@class="job-time"]/span/text()').extract()[0]
            yield boss_item

        # Afterwards collect the pagination links. ``xpath`` replaces the
        # deprecated ``select`` and matches the call used for the rows above.
        for href in hxs.xpath('//div[@class="page"]/a/@href').extract():
            url = "http://" + Demo.allowed_domains[0] + href
            if 'javascript' not in url and url not in Demo.url_over_set:
                Demo.url_set.add(url)
        # The parenthesised form prints the same on Python 2 and Python 3.
        print(Demo.url_set)

        # Nothing left to visit: set.pop() on an empty set raises KeyError.
        if not Demo.url_set:
            return
        next_url = Demo.url_set.pop()
        # NOTE(review): time.sleep blocks the whole crawler process for one
        # second; Scrapy's DOWNLOAD_DELAY setting is the usual way to throttle.
        time.sleep(1)
        yield Request(next_url, callback=self.parse)

Beispiel #11

0

Datei anzeigen

Datei: zhipin.py Projekt: xuyanbo03/python-demo

 def parse_job(self, response):
     """Yield a ``BossItem`` built from the job banner and the side company box.

     City, work years and education are the first three text nodes found in
     the banner's paragraphs.
     """
     banner_name = "//div[@class='job-banner']//div[@class='name']"
     title = response.xpath(banner_name + "/h1/text()").get().strip()
     salary = response.xpath(banner_name + "/span/text()").get().strip()

     banner_texts = response.xpath("//div[@class='job-banner']//p//text()").getall()
     city, work_years, education = banner_texts[0], banner_texts[1], banner_texts[2]

     company = response.xpath(
         "//div[@class='sider-company']//div[@class='company-info']/a[1]/@title"
     ).get().strip()

     yield BossItem(
         name=title,
         salary=salary,
         city=city,
         work_years=work_years,
         education=education,
         company=company,
     )

Beispiel #12

0

Datei anzeigen

 def parse_job(self, response):
     """Yield a ``BossItem`` with name, salary, three info fields and company.

     The company text is passed through unstripped and may be ``None``.
     """
     heading = response.xpath("//h1[@class='name']/text()").get()
     name = heading.strip()
     pay = response.xpath("//h1[@class='name']/span/text()").get()
     salary = pay.strip()

     details = response.xpath(
         "//div[@class='job-primary']/div[@class='info-primary']/p//text()"
     ).getall()
     city, work_years, education = details[0], details[1], details[2]

     company = response.xpath("//div[@class='info-company']//a/text()").get()

     fields = dict(name=name, salary=salary, city=city,
                   work_years=work_years, education=education,
                   company=company)
     yield BossItem(**fields)

Beispiel #13

0

Datei anzeigen

 def parse_item(self, response):
     """Yield a ``BossItem`` holding the page title.

     When the title node is missing, the IndexError message is printed and
     nothing is yielded.
     """
     item = BossItem()
     # Keep the try body to the one lookup that is expected to fail. The
     # previous ``except Exception`` around the ``yield`` also swallowed
     # unrelated errors, including ones thrown into the generator.
     try:
         item["title"] = response.xpath(
             '//div[@class="name"]/h1/text()').extract()[0]
     except IndexError as e:
         print(e)
         return
     print(item)
     yield item

Beispiel #14

0

Datei anzeigen

Datei: zhipin.py Projekt: yujunsen/python

    def parse_data(self, response):
        """Yield a ``BossItem`` from a job detail page.

        The first three info lines are split on the full-width colon and the
        part after it becomes city, work year and education.
        """
        title = response.xpath('//div[@class="name"]/h1/text()').get()
        salary = response.xpath('//span[@class="badge"]/text()').get().strip()
        info_lines = response.xpath(
            '//div[@class="job-primary detail-box"]/div[2]/p/text()'
        ).getall()
        company = response.xpath('//h3[@class="name"]//text()').get()

        # Index explicitly so a short list still raises IndexError.
        city, work_year, education = [
            info_lines[position].split('：')[1] for position in range(3)
        ]

        yield BossItem(
            title=title,
            salary=salary,
            company=company,
            city=city,
            work_year=work_year,
            education=education,
        )

Beispiel #15

0

Datei anzeigen

Datei: boss_zhipin.py Projekt: Z9629/test

 def parse(self, response):
     """Yield a ``BossItem`` per job card, then follow the "next" link if any.

     Raises:
         IndexError: if a card has no title or wage text.
         AttributeError: if a card has no primary-info paragraph text.
     """
     for card in response.css(".job-primary"):
         item = BossItem()
         item["title"] = card.css(".job-title::text").extract()[0]
         item["wage"] = card.css(".red::text").extract()[0]
         item["site"] = card.css(
             ".info-primary p::text").extract_first().strip()
         item["name"] = card.css(
             ".company-text .name a::text").extract_first()
         yield item

     # Pagination. The selector was missing the closing parenthesis of
     # ``::attr(href)``, and ``extract()[0]`` raised IndexError on a page
     # without a link instead of producing the None the check below expects.
     next_page = response.css(".page .next::attr(href)").extract_first()
     if next_page is not None:
         yield response.follow('https://www.zhipin.com' + next_page,
                               callback=self.parse)

Beispiel #16

0

Datei anzeigen

Datei: zhihu.py Projekt: SamHuang7880/boss

    def next2(self, response):
        """Return a ``BossItem`` whose ``txt`` is the page text with noise removed.

        The request's Cookie headers are printed first. The string form of
        the list of all text nodes is then stripped of ASCII letters, the
        listed punctuation characters, spaces and single quotes.
        """
        # Cookies that were sent with the request.
        sent_cookies = response.request.headers.getlist('Cookie')
        print(sent_cookies)

        text_nodes = response.xpath('//text()').extract()
        # str() of the list is cleaned, so brackets, quotes and commas from
        # the list repr are part of what gets removed.
        cleaned = re.sub(
            r'[a-zA-Z",:{}\\.(\)!;%&?$@#><+|*/[\]_=-]', "", str(text_nodes))
        cleaned = cleaned.replace(" ", "").replace("'", "")
        print(cleaned)

        result = BossItem()
        result['txt'] = cleaned
        return result

Beispiel #17

0

Datei anzeigen

 def parse_next_page(self, response):
     """Yield a ``BossItem`` per job card and print its position name.

     Raises:
         IndexError: if a card lacks one of the expected text nodes.
     """
     sel = Selector(response)
     for site in sel.xpath('//div[@class="job-primary"]'):
         item = BossItem()
         item["position_name"] = site.xpath(
             'div[@class="info-primary"]/h3/text()').extract()[0]
         item["salary"] = site.xpath(
             'div[@class="info-primary"]/h3/span/text()').extract()[0]
         # NOTE(review): the class was spelled "info-comapny", which reads as
         # a typo for "info-company" - confirm against the live markup.
         item["company"] = site.xpath(
             'div[@class="info-company"]/div/h3/text()').extract()[0]
         item["company_type"] = site.xpath(
             'div[@class="info-company"]/div/p/text()').extract()[0]
         yield item
         # The parenthesised form prints the same on Python 2 and Python 3.
         print(item["position_name"])
Beispiel #18

0

Datei anzeigen

    def parse(self, response):
        """Yield one ``BossItem`` per job card on a listing page.

        Missing company-info or publisher nodes produce empty strings rather
        than an IndexError that would abort the rest of the page.
        """
        for card in response.selector.xpath('//div[@class="job-primary"]'):
            boss = BossItem()
            job_title = ''.join(
                card.xpath('.//div[@class="job-title"]/text()').extract())
            job_price = ''.join(
                card.xpath('.//h3[@class="name"]/a/span/text()').extract())
            comp_name = ''.join(
                card.xpath('.//div[@class="company-text"]/h3/a/text()').extract())

            # One query serves both the first line and the head-count field.
            comp_info = card.xpath(
                './/div[@class="company-text"]/p/text()').extract()
            comp_line = comp_info[0] if comp_info else ''
            # With exactly three entries the third is used, otherwise the second.
            if len(comp_info) == 3:
                per_num = comp_info[2]
            elif len(comp_info) >= 2:
                per_num = comp_info[1]
            else:
                per_num = ''

            publis_name = card.xpath(
                './/div[@class="info-publis"]/h3/text()'
            ).extract_first(default='')
            publis_time = ''.join(
                card.xpath('.//div[@class="info-publis"]/p/text()').extract())
            src = ''.join(
                card.xpath('.//a[@class="btn btn-startchat"]/@redirect-url').
                extract())

            boss['job_title'] = job_title
            boss['job_price'] = job_price
            boss['comp_name'] = comp_name
            boss['comp_line'] = comp_line
            boss['per_num'] = per_num
            boss['publis_name'] = publis_name
            boss['publis_time'] = publis_time
            boss['src'] = src

            yield boss

Beispiel #19

0

Datei anzeigen

    def parse_detail(self, response):
        """Yield a ``BossItem`` describing the job on a detail page.

        Tags are the text nodes of the first ``tag-more`` block, without the
        first and last node, joined by commas.
        """
        company_link = response.xpath(
            "//div[@class='company-info']/a[@ka='job-detail-company_custompage']/text()")
        company = company_link.get().strip()
        position = response.xpath("//div[@class='name']/h1/text()").get()
        salary = response.xpath("//span[@class='salary']/text()").get()

        info_texts = response.xpath("//div[@class='info-primary']/p/text()").getall()
        city, experience, education = info_texts[0], info_texts[1], info_texts[2]

        description_parts = response.xpath(
            "(//div[@class='job-sec'])[1]/div/text()").getall()
        describes = "".join(description_parts).strip()
        origin_url = response.url

        tag_nodes = response.xpath(
            "(//div[@class='tag-more'])[1]/div[contains(@class, 'tag-all')]//text()"
        ).getall()
        tags = ",".join(tag_nodes[1:-1]).strip()

        yield BossItem(
            company=company,
            position=position,
            salary=salary,
            city=city,
            experience=experience,
            education=education,
            describes=describes,
            tags=tags,
            origin_url=origin_url,
        )

Beispiel #20

0

Datei anzeigen

Datei: zhipin.py Projekt: Samantha09/Scrapy

 def parse_detail(self, response):
     """Yield a ``BossItem`` with title, salary, company and three info fields.

     Only the salary is stripped; title and company are passed through as
     returned by the selector.
     """
     title = response.xpath("//div[@class='name']/h1/text()").get()
     salary_text = response.xpath("//span[@class='salary']/text()").get()
     salary = salary_text.strip()
     company = response.xpath(
         "//div[@class='detail-content']//div[@class='name']/text()").get()

     info_texts = response.xpath(
         "//div[@class='info-primary']//p/text()").getall()
     city, work_years, education = info_texts[0], info_texts[1], info_texts[2]

     yield BossItem(
         title=title,
         salary=salary,
         company=company,
         city=city,
         work_years=work_years,
         education=education,
     )

Beispiel #21

0

Datei anzeigen

Datei: zhiping.py Projekt: 520wsl/python-scrapy-test

 def parse_job(self, response):
     """Yield a ``BossItem`` built from the job header and company box.

     City, work years and education come from the first paragraph of the
     primary-info block. The company is the text of the last link in the
     company-info block.
     """
     def stripped_text(xpath):
         # First match with surrounding whitespace removed; a missing node
         # fails on ``None.strip()``.
         return response.xpath(xpath).get().strip()

     name = stripped_text("//div[@class='name']/h1/text()")
     salary = stripped_text("//div[@class='name']/span/text()")

     first_paragraph = response.xpath(
         "//div[contains(@class,'job-primary')]/div[@class='info-primary']/p[1]/text()"
     ).getall()
     city = first_paragraph[0]
     work_years = first_paragraph[1]
     education = first_paragraph[2]

     company = stripped_text("//div[@class='company-info']/a[last()]/text()")

     item = BossItem(name=name,
                     salary=salary,
                     city=city,
                     work_years=work_years,
                     education=education,
                     company=company)
     yield item

Beispiel #22

0

Datei anzeigen

 def parse_item(self, response):
     """Return (not yield) a ``BossItem`` parsed from a job detail page."""
     name = response.xpath('//div[@class="name"]/h1/text()').get()
     salary_node = response.xpath('//div[@class="name"]/span/text()').get()
     salary = salary_node.strip()

     primary_texts = response.xpath(
         '//div[@class="info-primary"]/p/text()').getall()
     city, worked_years, education = (
         primary_texts[0], primary_texts[1], primary_texts[2])

     company_title = response.xpath(
         '//div[@class="company-info"]/a[1]/@title').get()
     company = company_title.strip()

     return BossItem(
         name=name,
         salary=salary,
         city=city,
         worked_years=worked_years,
         education=education,
         company=company,
     )

Beispiel #23

0

Datei anzeigen

 def parse_job(self, response):
     """Parse a job detail page and yield a ``BossItem``.

     Raises:
         AttributeError: if the title, salary or company node is missing.
         IndexError: if the info paragraph has fewer than three text nodes.
     """
     title = response.xpath("//div[@class='name']/h1/text()").get().strip()
     salary = response.xpath(
         "//div[@class='name']/span/text()").get().strip()
     # The class was misspelled 'info-pro=imary', so the query could not
     # match the 'info-primary' block.
     job_info = response.xpath(
         "//div[@class='job-primary detail-box']/div[@class='info-primary']/p//text()"
     ).getall()
     city = job_info[0]
     work_years = job_info[1]
     education = job_info[2]
     # ``@class`` tests the attribute. The previous ``class`` (no ``@``)
     # tested for a child element named "class", so .get() returned None and
     # .strip() failed.
     company_name = response.xpath(
         "//div[@class='company-info']//a/text()").get().strip()
     item = BossItem(title=title,
                     salary=salary,
                     city=city,
                     work_years=work_years,
                     education=education,
                     company_name=company_name)
     yield item

Beispiel #24

0

Datei anzeigen

 def parse_item(self, response):
     """Yield a ``BossItem`` with stripped job fields and the company name.

     The company is the third link text in the company-info block.
     """
     name = response.xpath("//div[@class='name']/h1/text()").get().strip()
     salary = response.xpath(
         "//div[@class='name']/span/text()").get().strip()

     detail_texts = response.xpath(
         "//div[@class='job-primary detail-box']//div[@class='info-primary']//p/text()"
     ).getall()
     job_city = detail_texts[0].strip()
     work_year = detail_texts[1].strip()
     education = detail_texts[2].strip()

     company_links = response.xpath(
         "//div[@class='company-info']/a/text()").getall()
     company = company_links[2].strip()

     yield BossItem(
         name=name,
         salary=salary,
         jobCity=job_city,
         workYear=work_year,
         education=education,
         company=company,
     )

Beispiel #25

0

Datei anzeigen

    def parse(self, response):
        """Yield a ``BossItem`` per job card and follow the "next" link.

        The next-page request is only issued when the page has a usable
        ``href`` on its "next" link. Without that check the crawl re-requested
        the current URL, because ``urljoin(None)`` resolves to the page itself.

        Raises:
            IndexError: if a card's info or company paragraph has too few texts.
        """
        for job in response.css('.job-box .job-list ul li .job-primary'):
            item = BossItem()
            item['岗位名称'] = job.css('.job-title::text').extract_first()
            item['薪资范围'] = job.css('.red::text').extract_first()
            # Each paragraph is queried once and indexed for its three fields.
            job_texts = job.css('.info-primary p::text').extract()
            item['工作地'] = job_texts[0]
            item['工作经验'] = job_texts[1]
            item['学历要求'] = job_texts[-1]
            item['公司名称'] = job.css(
                '.company-text .name a::text').extract_first()
            company_texts = job.css('.company-text p::text').extract()
            item['所属行业'] = company_texts[0]
            item['融资阶段'] = company_texts[1]
            item['公司规模'] = company_texts[-1]
            yield item

        # ``next_page`` avoids shadowing the builtin ``next``.
        next_page = response.css('.page .next::attr(href)').extract_first()
        # NOTE(review): a disabled "next" button presumably carries a
        # "javascript:" href rather than none at all - confirm on a last page.
        if next_page and not next_page.startswith('javascript'):
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse)

Beispiel #26

0

Datei anzeigen

Datei: zp.py Projekt: Aiyuanjiang/oxmo

 def parse(self, response):
     """Yield a ``BossItem`` per job-list row, then follow the next-page link.

     Each row's HTML is extracted as a string and picked apart with regular
     expressions.

     Raises:
         IndexError: if a row's HTML lacks one of the expected fragments.
     """
     rows = response.xpath('//div[@class="job-list"]/ul/li').extract()
     for data in rows:
         # A fresh item per row: one shared instance would be mutated after
         # it had already been yielded.
         item = BossItem()
         item["title"] = re.findall('<div class="job-title">(.*?)</div>', data, re.S)[0]
         item["money"] = re.findall('<span class="red">(.*?)</span>', data, re.S)[0]
         item["loc"] = re.findall('<p>(.*?)<em class="vline">', data, re.S)[0]
         url = re.findall('<a href="(/job_detail/.*?)"', data, re.S)[0]
         item["url"] = "https://www.zhipin.com" + url
         item["sb_time"] = re.findall('em class="vline"></em>(.*?)<em', data, re.S)[0]
         # The same pattern supplies both "school" (first hit) and
         # "company_rs" (second hit); run it once.
         trailing_texts = re.findall(
             '<em class="vline"></em>.*?<em class="vline"></em>(.*?)</p>', data, re.S)
         item["school"] = trailing_texts[0]
         item["company"] = re.findall('<h3 class="name"><.*?>(.*?)</a></h3>', data, re.S)[0]
         item["company_rs"] = trailing_texts[1]
         yield item

     # extract_first() returns None when there is no link, where
     # extract()[0] raised IndexError and ended the crawl with an error.
     next_page = response.xpath('//a[@ka="page-next"]/@href').extract_first()
     if next_page:
         next_url = "https://www.zhipin.com" + next_page
         yield scrapy.Request(next_url, callback=self.parse)

Beispiel #27

0

Datei anzeigen

 def parse_item(self, response):
     """Yield a ``BossItem`` with job header, info fields, company and description.

     The position description is the first matching text node and is passed
     through unstripped (it may be ``None``).
     """
     name = response.xpath("//h1/text()").get().strip()
     salary = response.xpath("//span[@class='salary']/text()").get().strip()

     header_texts = response.xpath(
         "//*[@id='main']/div[1]/div/div/div[2]/p//text()").getall()
     city, work_year, education = (
         header_texts[0], header_texts[1], header_texts[2])

     positon_info = response.xpath(
         "//div[@class='job-sec']//div[1][@class='text']/text()").get()

     company_text = response.xpath(
         "//div[@class='company-info']//a[2]/text()").get()
     company = company_text.strip()

     yield BossItem(
         name=name,
         salary=salary,
         city=city,
         work_year=work_year,
         education=education,
         company=company,
         positon_info=positon_info,
     )

Beispiel #28

0

Datei anzeigen

 def parse_job(self, response):
     """Yield a ``BossItem`` whose fields are all whitespace-stripped."""
     def first_stripped(xpath):
         # First match of ``xpath`` without surrounding whitespace.
         return response.xpath(xpath).get().strip()

     title = first_stripped("//div[@class='name']/h1/text()")
     salary = first_stripped(
         "//div[@class='name']/span[@class='badge']/text()")

     info_texts = response.xpath(
         "//div[@class='job-primary detail-box']//div[@class='info-primary']/p/text()"
     ).getall()
     city = info_texts[0].strip()
     work_years = info_texts[1].strip()
     education = info_texts[2].strip()

     company = first_stripped(
         "//div[@class='info-company']/h3[@class='name']/a/text()")

     # Yielded items go on to the item pipelines.
     yield BossItem(
         title=title,
         salary=salary,
         city=city,
         work_years=work_years,
         education=education,
         company=company,
     )

Beispiel #29

0

Datei anzeigen

Datei: zhipin.py Projekt: qdcxj1993/Scrapy-Redis_Demos

 def parse_job(self, response):
     """Yield a ``BossItem`` parsed from a job detail page.

     Name, salary and company have leading and trailing whitespace removed.
     The company is read from the ``info`` div inside the company-info block.
     """
     name_text = response.xpath("//div[@class='name']/h1/text()").get()
     name = name_text.strip()
     salary_text = response.xpath("//div[@class='name']/span/text()").get()
     salary = salary_text.strip()

     info_texts = response.xpath(
         "//div[@class='job-primary detail-box']/div[@class='info-primary']/p//text()"
     ).getall()
     city, work_years, education = (
         info_texts[0], info_texts[1], info_texts[2])

     company_text = response.xpath(
         "//div[@class='company-info']/div[@class='info']/text()").get()
     company = company_text.strip()

     yield BossItem(
         name=name,
         salary=salary,
         city=city,
         work_years=work_years,
         education=education,
         company=company,
     )

Beispiel #30

0

Datei anzeigen

Datei: zhiping.py Projekt: mirrorthink/python

 def parse_job(self, response):
     """Yield a ``BossItem`` that also carries the raw info text list.

     Prints ``1`` and the item before yielding it.
     """
     heading = '//h1[@class="name"]'
     title = response.xpath(heading + '/text()').get().strip()
     salary = response.xpath(heading + '/span/text()').get().strip()

     job_info = response.xpath(
         '//div[@class="job-primary"]/div[@class="info-primary"]/p//text()'
     ).getall()
     city, work_years, education = job_info[0], job_info[1], job_info[2]

     company = response.xpath(
         '//div[@class="info-primary"]//a/text()').get()

     fields = dict(
         title=title,
         salary=salary,
         job_info=job_info,
         city=city,
         work_years=work_years,
         education=education,
         company=company,
     )
     item = BossItem(**fields)
     print(1)
     print(item)
     yield item