Beispiel #1
0
    def parse(self, response):
        """Populate a ZhaopinItem from one job-detail page.

        Every field is read with ``extract_first`` (None when the xpath
        matches nothing).  The page is split into four regions; each
        field's full xpath is listed in a single table and evaluated in
        one loop.
        """
        item = ZhaopinItem()
        item['url'] = response.url

        top = "//div[@class='top-txt']"
        left = "//ul[@class='list-left left']"
        meta = "//div[@class='abs']"
        details = "//div[@class='details']"

        # (item key, xpath expression) — evaluated in order.
        field_xpaths = [
            ('position', f"{top}/span[contains(@class, 'title')]/text()"),
            ('salary', f"{top}/span[@class='salary']/text()"),
            ('area', f"{left}//span[@class='area']/../text()"),
            ('education', f"{left}//span[@class='xueli']/../text()"),
            ('experience', f"{left}//span[@class='minge']/../text()"),
            ('number', f"{left}//span[@class='num']/../text()"),
            ('type_', f"{meta}/p[contains(text(), '职位类型')]/text()"),
            ('pubdate', f"{meta}/p[contains(text(), '发布时间')]/text()"),
            ('validate', f"{meta}/p[contains(text(), '有效日期')]/text()"),
            ('duty', f"{meta}/p[@class='duty duty-box']/text()"),
            ('company', f"{details}//div[@class='right txt']/p/text()"),
            ('co_property', f"{details}/p[contains(text(), '性质')]/text()"),
            ('co_scale', f"{details}/p[contains(text(), '规模')]/text()"),
            ('co_type', f"{details}/p[contains(text(), '类型')]/text()"),
            ('address', f"{details}/p[contains(text(), '工作地点')]/text()"),
        ]
        for key, expression in field_xpaths:
            item[key] = response.xpath(expression).extract_first()

        yield item
Beispiel #2
0
    def parse_detail(self, response):
        """Build a ZhaopinItem from a recruitment detail page."""
        item = ZhaopinItem()
        # Link of this detail page.
        item['url'] = response.url

        # Fields whose value is the text of a <div class="..."> node.
        simple_fields = {
            'title': '//div[@class="title"]/text()',          # subject
            'starttime': '//div[@class="starttime"]/text()',  # start date
            'endtime': '//div[@class="endtime"]/text()',      # end date
            'cityname': '//div[@class="cityname"]/text()',    # hiring city
            'address': '//div[@class="address"]/text()',      # full address
        }
        for key, expression in simple_fields.items():
            item[key] = response.xpath(expression).extract_first()

        # Body text sits in the first paragraph of the middleLeft column.
        item['content'] = response.xpath(
            '//div[@class="middleLeft"]/p[1]/text()').extract_first()

        yield item
Beispiel #3
0
    def processingThisPage(self, response):
        """Walk every job table on a listing page.

        For each table row a ZhaopinItem is built from joined text nodes;
        when the row carries a detail link, a follow-up Request to
        ``self.loadDetailPage`` is yielded before the item (rows without
        a detail link are skipped entirely, matching the original flow).
        """
        print(response.url)
        print(response.xpath("//title/text()"))
        tables = response.xpath(
            '//div[@id="newlist_list_content_table"]/table')

        # TODO(review): a pre-filter step to drop unwanted elements was
        # suggested in the original notes — confirm whether it is needed.

        def joined(selector, expression):
            # Concatenate every text node the expression matches.
            return ''.join(selector.xpath(expression).extract())

        for table in tables:
            items = ZhaopinItem()
            # Job title.
            items['jobName'] = joined(table, ".//tr[1]/td[1]/div/a[1]//text()")
            # Company name.
            items['company'] = joined(table, ".//tr[1]/td[3]//text()")
            # Salary / compensation.
            items['salary'] = joined(table, ".//tr[1]/td[4]//text()")
            # Work location.
            items['location'] = joined(table, ".//tr[1]/td[5]//text()")
            # Company ownership type.
            items['enterprise'] = joined(
                table, ".//tr[2]/td/div/div/ul/li[1]/span[2]//text()")
            # Company size.
            items['scale'] = joined(
                table, ".//tr[2]/td/div/div/ul/li[1]/span[3]//text()")
            # Required work experience.
            items['experience'] = joined(
                table, ".//tr[2]/td/div/div/ul/li[1]/span[4]//text()")
            # Required education.
            items['backGroup'] = joined(
                table, ".//tr[2]/td/div/div/ul/li[1]/span[5]//text()")
            # Detailed job requirements.
            items['require'] = joined(table, ".//tr[2]/td/div/div/ul/li[2]//text()")

            # Follow the row's detail page, when one is linked.
            detail_links = table.xpath(".//tr[1]/td[1]/div/a[1]/@href").extract()
            if not detail_links:
                continue

            yield scrapy.Request(detail_links[0], callback=self.loadDetailPage)
            yield items
Beispiel #4
0
 def parse(self, response):
     """Extract job rows from a Zhilian search-result page.

     Bug fix: the original created a single ZhaopinItem before the loop
     and mutated it for every row, so every yielded reference pointed at
     the same object — downstream (asynchronous) pipelines could see
     only the last row's data.  A fresh item is now built per row.
     """
     # The first table is the header row; skip it.
     jobs = response.xpath('//table[@class="newlist"]')[1:]
     for job in jobs:
         item = ZhaopinItem()
         item['jobname'] = job.xpath(
             './/td[@class="zwmc"]//div/a[1]/text()').extract_first()
         item['companyname'] = job.xpath(
             './/td[@class="gsmc"]/a[1]/text()').extract_first()
         item['salary'] = job.xpath(
             './/td[@class="zwyx"]/text()').extract_first()
         item['workingplace'] = job.xpath(
             './/td[@class="gzdd"]/text()').extract_first()
         item['posttime'] = job.xpath(
             './/td[@class="gxsj"]/span/text()').extract_first()
         yield item
Beispiel #5
0
    def parse_item(self, response):
        """Parse one job-detail page table into a ZhaopinItem.

        Fixes: every xpath was evaluated twice (once to test, once to
        take the first element) — each is now evaluated exactly once;
        the Python-2-only ``print item['name']`` statement is replaced by
        the call form, valid on both Python 2 and 3.

        Fallback behaviour is preserved: when an xpath matches nothing,
        the field holds the empty ``extract()`` list, not a string.
        """
        table = '//table[@class="tablelist textl"]'

        def first_or_list(expression):
            # Evaluate the xpath once; return the first text node if any
            # matched, otherwise the (empty) result list itself.
            values = response.xpath(expression).extract()
            return values[0] if values else values

        item = ZhaopinItem()
        item['name'] = first_or_list(table + '//tr[1]/td/text()')
        print(item['name'])  # debug trace of the position name

        item['location'] = first_or_list(table + '//tr[2]/td[1]/text()')
        item['type'] = first_or_list(table + '//tr[2]/td[2]/text()')

        # Headcount: drop the trailing character (unit suffix) when present.
        num = first_or_list(table + '//tr[2]/td[3]/text()')
        item['num'] = num[:-1] if num else num

        item['zhize'] = first_or_list(table + '//tr[3]//li/text()')
        item['yaoqiu'] = first_or_list(table + '//tr[4]//li/text()')

        item['url'] = response.url
        yield item
Beispiel #6
0
    def processJobDetail(self, response):
        """Collect the fields of a single job-detail page into a ZhaopinItem."""
        items = ZhaopinItem()
        info = response.xpath("/html/body/div[6]/div[1]")

        def text_of(expression):
            # Join all text nodes the expression matches.  NOTE: several
            # expressions are document-absolute ("/html/..."), so they are
            # resolved against the whole document, not the `info` node.
            return ''.join(info.xpath(expression).extract())

        # Position title.
        items['jobName'] = text_of("/html/body/div[5]/div[1]/div[1]/h1//text()")
        # Employer name.
        items['company'] = text_of("/html/body/div[5]/div[1]/div[1]/h2//text()")
        # Pay range.
        items['salary'] = text_of("ul/li[1]//text()")
        # Work place.
        items['location'] = text_of("ul/li[2]//text()")
        # Company ownership type.
        items['enterprise'] = text_of(
            "/html/body/div[6]/div[2]/div[1]/ul/li[2]//text()")
        # Company size.
        items['scale'] = text_of(
            "/html/body/div[6]/div[2]/div[1]/ul/li[1]//text()")
        # Required work experience.
        items['experience'] = text_of("ul/li[5]//text()")
        # Required education.
        items['backGroup'] = text_of("ul/li[6]//text()")

        # Full job description with blanks and CRLF pairs removed.
        detail = text_of("/html/body/div[6]/div[1]/div[1]/div/div[1]//text()")
        items['detail'] = detail.replace(' ', '').replace('\r\n', '')

        # Original url of this detail page.
        items['linkUrl'] = response.url

        yield items
Beispiel #7
0
    def parse(self, response):
        """Parse a Liepin search-result page and follow the next page.

        Yields one ZhaopinItem per job row, then schedules the pager's
        eighth link (index 7) as the next result page.

        Fixes: the local ``next`` shadowed the builtin, and ``next[7]``
        raised IndexError whenever the pager had fewer than 8 links
        (e.g. on the last page), killing the callback — pagination is
        now guarded.
        """
        for quote in response.css('.sojob-list li'):
            item = ZhaopinItem()
            item['zhiwu'] = quote.css('.job-info h3 a::text').extract_first()       # position
            item['danwei'] = quote.css('.company-name a::text').extract_first()     # employer
            item['gongzi'] = quote.css('.text-warning::text').extract_first()       # salary
            item['chengshi'] = quote.css('.area::text').extract_first()             # city
            item['link'] = quote.css('.job-info h3 a::attr(href)').extract_first()  # detail link
            yield item

        pager_links = response.css('.pagerbar a::attr(href)').extract()
        if len(pager_links) > 7:
            next_url = 'https://www.liepin.com' + pager_links[7]
            yield scrapy.Request(url=next_url,
                                 callback=self.parse,
                                 dont_filter=True)
Beispiel #8
0
 def parse(self, response):
     """Scrape a Zhaopin Guangzhou listing page.

     For every job link on the page, the detail page is fetched
     synchronously with urllib and salary/location are regexed out of
     the raw HTML; pages 2-100 are then scheduled through this same
     callback.

     Fixes: a fresh ZhaopinItem is built per job (the original mutated
     one shared instance across the loop, so yielded references could
     all hold the last job's data); rows whose salary/location pattern
     does not match are skipped instead of raising IndexError; the two
     regexes are compiled once, outside the loop; ``zip`` replaces
     parallel indexing, so mismatched list lengths cannot IndexError.
     """
     names = response.xpath("//span[@class='post']/a/text()").extract()
     links = response.xpath("//span[@class='post']/a/@href").extract()
     companies = response.xpath(
         "//span[@class='company_name']/a/text()").extract()

     salary_pat = re.compile('<li><span>职位月薪:</span><strong>(.*?)&nbsp;<a',
                             re.S)
     location_pat = re.compile(
         '<li><span>工作地点:</span><strong><a target="_blank" href=".*?">(.*?)</a>',
         re.S)

     for name, link, company in zip(names, links, companies):
         data = urllib.request.urlopen(link).read().decode('utf-8', 'ignore')
         money = salary_pat.findall(data)
         location = location_pat.findall(data)
         if not money or not location:
             continue  # detail page layout changed or request blocked; skip
         item = ZhaopinItem()
         item['money'] = money[0]
         item['location'] = location[0]
         item['name'] = name
         item['link'] = link
         item['company'] = company
         yield item

     for page in range(2, 101):
         url = 'http://jobs.zhaopin.com/guangzhou/p' + str(page)
         print('正在爬取第' + str(page) + '页')
         yield Request(url=url, callback=self.parse, headers=self.headers)
Beispiel #9
0
 def parse_job(self, response):
     """Parse one Zhilian terminal (job-detail) page into a ZhaopinItem.

     Robustness fix: fields that strip whitespace used
     ``extract_first().strip()``, which raised AttributeError whenever
     the xpath matched nothing (extract_first returns None);
     ``extract_first(default='')`` makes a missing node yield an empty
     string instead.  The repeated terminalpage-left prefix is hoisted
     into one variable.
     """
     left_ul = "//div[contains(@class,'terminalpage-left')]/ul"

     item = ZhaopinItem()
     item['job_url'] = response.url
     item['job_title'] = response.xpath(
         "//div[contains(@class,'inner-left fl')]/h1/text()").extract_first()
     item['company'] = response.xpath(
         "//div[contains(@class,'inner-left fl')]/h2//text()").extract_first(
             default='').strip()
     item['salary'] = response.xpath(
         left_ul + "/li[1]/strong/text()").extract_first(default='').strip()
     item['location'] = "".join(
         response.xpath(left_ul + "/li[2]/strong//text()").extract())
     item['post_time'] = response.xpath(
         left_ul + "/li[3]/strong//text()").extract_first()
     item['type_of_empl'] = response.xpath(
         left_ul + "/li[4]/strong/text()").extract_first()
     item['work_exp'] = response.xpath(
         left_ul + "/li[5]/strong/text()").extract_first()
     item['min_edu_qual'] = response.xpath(
         left_ul + "/li[6]/strong/text()").extract_first()
     item['num_of_ppl'] = response.xpath(
         left_ul + "/li[7]/strong/text()").extract_first(default='').strip()
     item['occup_type'] = response.xpath(
         left_ul + "/li[8]/strong//text()").extract_first()
     # Free-text sections are delegated to sibling helpers.
     item['job_desc'] = self.parse_job_desc(response)
     item['co_profile'] = self.parse_co_profile(response)
     yield item
Beispiel #10
0
    def parse(self, response):
        """Handle one Lagou position-search JSON response.

        On success, yields one detail-page Request per recruiter entry
        in ``content.hrInfoMap``; on failure prints the server's error
        message in red and returns.

        Fixes: ``response.text`` was json-decoded up to four separate
        times — it is now parsed once; ``isSuccess == True`` became a
        plain truthiness test; ``msg = print(...)`` bound (and returned)
        None, since print() always returns None — removed; the redundant
        second read of ``response.meta['pn']`` is gone.
        """
        pn = response.meta['pn']
        referer = response.meta['referer']
        payload = json.loads(response.text)

        if not payload['success']:
            print("\033[1;31m{0}\n+访问失败:{1}!+\n{0}\033[0m".format(
                32 * '+', payload['msg']))
            return

        if payload['content']['pageNo'] != 0:
            print(f'第{pn}页页面信息获取成功!')
        else:
            print(f'\033[1;31m ***Warning:第{pn}页页面信息获取失败!*** \033[0m')
            print(f'\033[1;31m {response.text} \033[0m')
            print(
                '{0}可能的错误:\n1.district对应的bizArea不一致。\n2.没有更多页面!{1}'.format(
                    '\033[1;31m', '\033[0m'))
            print('\033[1;31m {0} \033[0m \n'.format(30 * '*'))

        kd = response.meta['kd']
        for zhaopinId, comInfo in payload['content']['hrInfoMap'].items():
            item = ZhaopinItem()
            item['keyWord'] = kd
            item['zhaopinId'] = zhaopinId
            item['userId'] = comInfo['userId']
            item['phone'] = comInfo['phone']
            # positionName may be '' or None; normalise both to None.
            position_name = comInfo['positionName']
            if position_name:
                item['positionName'] = position_name.replace('&amp;',
                                                             '&').strip()
            else:
                item['positionName'] = None
            item['receiveEmail'] = comInfo['receiveEmail']
            item['realName'] = comInfo['realName']
            if comInfo['portrait'] is not None:
                item['portrait'] = ('https://www.lgstatic.com/thumbnail_300x300/'
                                    + comInfo['portrait'])
            else:
                item['portrait'] = None
            item['userLevel'] = comInfo['userLevel']
            item['canTalk'] = comInfo['canTalk']
            headers = {
                'Host': 'www.lagou.com',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Language':
                'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Referer': referer,
                'Connection': 'keep-alive',
            }
            yield Request(url=f'https://www.lagou.com/jobs/{zhaopinId}.html',
                          callback=self.detail_parse,
                          cookies=self.cookie,
                          dont_filter=True,
                          headers=headers,
                          meta={'item': item, 'pn': pn})
Beispiel #11
0
    def parse(self, response):
        """Parse one listing page, then paginate or advance to the next city.

        A 404 means the current location's pages are exhausted: move to
        the next entry of ``self.location_list`` (resetting the page
        counter).  Otherwise yield one ZhaopinItem per job container and
        request the next page of the same location.
        """
        if response.status == 404:
            print(
                "==============================================================================="
            )
            if self.cnt < len(self.location_list) - 1:
                self.num = 1
                self.cnt += 1
                url = self.base_url.format(
                    num=self.num, location=self.location_list[self.cnt])
                yield scrapy.Request(url, callback=self.parse)
            return

        detailobjs = response.xpath(
            "//div[@class='details_container bg_container ']")

        def nth_text(obj, expression, index):
            # The expressions are document-absolute ("//..."), so each row
            # is picked out by its position, not relative to `obj`.
            return obj.xpath(expression)[index].extract()

        def labelled_value(obj, expression):
            # "标签:值" pairs — keep only the value after the full-width colon.
            return obj.xpath(expression)[0].extract().split(":")[1]

        for i, detailobj in enumerate(detailobjs):
            item = ZhaopinItem()
            item['zhiwei'] = nth_text(
                detailobj, "//span[@class='post']/a/text()", i)
            item['zhiwei_url'] = nth_text(
                detailobj, "//span[@class='post']//a/@href", i)
            item['gongsi'] = nth_text(
                detailobj, "//span[@class='company_name']/a/text()", i)
            item['gongsi_url'] = nth_text(
                detailobj, "//span[@class='company_name']/a/@href", i)
            item['yuexin'] = nth_text(
                detailobj, "//span[@class='salary']/text()", i)
            item['gongzuodidian'] = nth_text(
                detailobj, "//span[@class='address']/text()", i)
            item['faburiqi'] = nth_text(
                detailobj, "//span[@class='release_time']/text()", i)
            item['gongzuojingyan'] = labelled_value(
                detailobj, "//div[@class='fleft detail_items']/span[1]/text()")
            item['xueli'] = labelled_value(
                detailobj, "//div[@class='fleft detail_items']/span[2]/text()")
            item['gongsiguimo'] = labelled_value(
                detailobj, "//div[@class='fleft detail_items']/span[3]/text()")
            item['gongsixingzhi'] = labelled_value(
                detailobj, "//div[@class='fleft detail_items']/span[4]/text()")
            item['location'] = self.location_list[self.cnt]
            yield item

        # Same location, next page.
        self.num += 1
        url = self.base_url.format(num=self.num,
                                   location=self.location_list[self.cnt])
        yield scrapy.Request(url, callback=self.parse)