Example 1
    def parse_item(self, response):

        item = Job51Item()

        city_name = self.city
        jobname = response.xpath('//div[@class="cn"]/h1/@title')[0].extract()
        # The title attribute holds "place | experience | education | ... |
        # publish date"; extract it once and split instead of re-running
        # the same XPath for every field.
        header = response.xpath('//div[@class="cn"]/p/@title')[0].extract()
        parts = [p.strip('\xa0') for p in header.split('|')]
        position = parts[0]
        workingExp = parts[1]
        eduLevel = parts[2]
        update_time = parts[4]
        salary = response.xpath(
            '//div[@class="cn"]/strong/text()')[0].extract()
        company_name = response.xpath(
            '//div[@class="cn"]/p/a/@title')[0].extract()
        require = response.xpath(
            '//div[@class="bmsg job_msg inbox"]/p/text()').extract()
        # Join into a local string; accumulating on self.str would leak
        # requirement text across every page the spider parses.
        job_require = ''.join(i[2:] for i in require)

        # Hash company name + update time into a stable record id.
        sha1 = hashlib.sha1()
        sha1.update((company_name + update_time).encode('utf8'))
        hash_id = sha1.hexdigest()

        # eval(field) reads the local variable named after each item field,
        # so the locals above must match Job51Item's fields exactly.
        for field in item.fields.keys():
            item[field] = eval(field)
        yield item
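
Every snippet on this page assumes a Job51Item declared elsewhere in its project. A minimal sketch matching the field names Example 1 relies on (the eval(field) trick requires each local to share its field's name); the real declarations are not shown in the source:

    import scrapy

    class Job51Item(scrapy.Item):
        # Hypothetical declaration; only the field names are known from
        # the snippet above.
        city_name = scrapy.Field()
        jobname = scrapy.Field()
        position = scrapy.Field()
        workingExp = scrapy.Field()
        eduLevel = scrapy.Field()
        salary = scrapy.Field()
        company_name = scrapy.Field()
        update_time = scrapy.Field()
        job_require = scrapy.Field()
        hash_id = scrapy.Field()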
Example 2
    def parse(self, response):
        nodes = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
        for node in nodes:
            item = Job51Item()
            jobname = node.xpath('./p//a/@title').extract_first()
            company = node.xpath(
                './span[@class="t2"]/a/text()').extract_first()
            location = node.xpath('./span[@class="t3"]/text()').extract_first()
            salary = node.xpath('./span[@class="t4"]/text()').extract_first()

            item['jobname'] = jobname
            item['company'] = company
            item['location'] = location
            item['salary'] = salary
            yield item

        next_url = response.xpath('//li[@class="bk"]/a/@href').extract()
        self.page += 1
        print("51job page:" + str(self.page))
        # A blocking sleep stalls the whole reactor; the DOWNLOAD_DELAY
        # setting is the idiomatic way to throttle a real spider.
        time.sleep(3)
        if next_url:
            # The last "bk" link on the page is the "next page" button.
            url = response.urljoin(next_url[-1])
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            print("exit")
Example 3
    def detailParse(self, response):
        item = Job51Item()
        print("detail page reached")
        # "暂无" ("not available") is the placeholder for missing fields.
        # Absolute body-rooted XPaths like these break easily when the
        # page layout shifts.
        item['company'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title').extract_first()
        if not item['company']:
            item['company'] = "暂无"
        item['workname'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title').extract_first()
        if not item['workname']:
            item['workname'] = "暂无"
        item['salary'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract_first()
        if not item['salary']:
            item['salary'] = "暂无"
        item['add'] = response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()').extract_first()
        if not item['add']:
            item['add'] = "暂无"

        item['workdetail'] = "".join(response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div/p/text()').extract())
        if not item['workdetail']:
            item['workdetail'] = "暂无"
        item['require'] = "".join(response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()').extract())
        if not item['require']:
            item['require'] = "暂无"
        yield item
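
The fallback pattern above repeats once per field; a small helper keeps it in one place (a sketch, not part of the original spider):

    def first_or_default(selector, xpath, default="暂无"):
        # Return the first XPath match, or the placeholder when empty.
        value = selector.xpath(xpath).extract_first()
        return value if value else default

Each assignment-plus-if pair then collapses to a single first_or_default(response, xpath) call.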
Example 4
    def parse_item(self, response):
        #self.log('Hi, this is an item page! %s' % response.url)

        item = Job51Item()
        item['company_url'] = response.url
        # response.xpath('/html/head/title/text()').extract[0] raises a
        # TypeError (extract is a method, not a list); pairing @title in
        # the XPath with an extract() call on the SelectorList is correct:
        cname = response.xpath(
            '//div[@class="in "]/h1/@title | //div[@class="in img_on"]/h1/@title'
        )
        # cname[0].extract() and cname.extract()[0] are equivalent here.
        item['company_name'] = cname.extract()[0]
        caddress = response.xpath('//p[@class="fp"]/text()').extract()
        cadstr = "".join(caddress)
        item['company_address'] = cadstr.strip().replace(" ", "")
        cinfolist = response.xpath('//div[@class="in"]/p/text()').extract()
        cinfostr = "".join(cinfolist)
        # Drop the post-processing below if the text needs its line breaks
        # kept for normal display on a web page.
        cinfo = cinfostr.strip().replace(" ", "").replace('\xa0', '')
        item['company_info'] = cinfo

        yield item
Example 5
    def joblist(self, response):
        jobs = response.xpath("//div[@id='resultList']/div")
        for job in jobs:
            try:
                item = Job51Item()
                item['jobname'] = job.xpath(
                    './p[@class="t1 "]//a/@title').extract()[0]
                item['joblink'] = job.xpath(
                    './p[@class="t1 "]//a/@href').extract()[0]
                item['company'] = job.xpath(
                    './span[@class="t2"]//a/@title').extract()[0]
                item['place'] = job.xpath(
                    './span[@class="t3"]/text()').extract()[0]
                try:
                    item['salary'] = job.xpath(
                        './span[@class="t4"]/text()').extract()[0]
                except IndexError:
                    # Some rows omit the salary; "面谈" = negotiable.
                    item['salary'] = '面谈'

                yield scrapy.Request(item['joblink'],
                                     callback=self.jobinfo,
                                     meta={'item': item})
            except IndexError:
                # Header rows and ads inside resultList lack these fields.
                print('joblist error', response.url)
Example 6
    def parse_item(self, response):
        l = ItemLoader(item=Job51Item(), response=response)
        l.add_value('Job_url', response.url)
        l.add_xpath('Job_name', '//div[@class="cn"]/h1/@title')
        l.add_xpath('Job_location', '//span[@class="lname"]/text()')
        l.add_xpath('Job_salary', '//div[@class="cn"]/strong/text()')
        l.add_xpath('Company_name', '//p[@class="cname"]/a/@title')
        # The raw values of the two fields below get cleaned up later in an
        # item pipeline (take the first element, split on "|").
        l.add_xpath('Company_type', '//p[@class="msg ltype"]/text()')
        l.add_xpath('Require_exp', '//div[@class="jtag inbox"]//span[@class="sp4"]/text()')
        l.add_xpath('Location', '//p[@class="fp"]/text()')

        return l.load_item()
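
Instead of deferring the first-element/split cleanup to an item pipeline, ItemLoader can apply it at load time through processors. A sketch under the same field names (the processors module lives at scrapy.loader.processors on older releases and in the itemloaders package on newer ones):

    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst

    class Job51Loader(ItemLoader):
        # Hypothetical loader: every field collapses to its first match,
        # and Company_type keeps only the segment before the first "|".
        default_item_class = Job51Item
        default_output_processor = TakeFirst()
        Company_type_in = MapCompose(lambda v: v.split('|')[0].strip())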
Example 7
 def parse(self, response):
     datas = response.xpath('//div[@class="el"]/p')
     for data in datas:
         # Fresh item per request; sharing one instance through meta would
         # let later rows overwrite earlier ones before they are processed.
         item = Job51Item()
         item['url'] = data.xpath('.//a/@href').extract_first()
         yield scrapy.Request(url=item['url'],
                              meta={'item': item},
                              callback=self.get_info)
     # The pagination text carries the total page count; take its first
     # run of digits.
     text = response.xpath(
         '//div[@class="p_in"]//span[@class="td"]/text()').extract_first()
     maxnum = int(re.findall(r'(\d+)', text)[0])
     for i in range(2, maxnum + 1):
         # key is the search keyword defined elsewhere in this spider.
         next_url = self.base_url.format(key=key, page=i)
         yield scrapy.Request(url=next_url, callback=self.parse)
Example 8
 def parse(self, response):
     node_list = response.xpath('//*[@id="resultList"]/div[@class="el"]')
     next_page = response.xpath('//div[@class="p_in"]/ul/li/a/@href').extract()
     for node in node_list:
         item = Job51Item()
         detail_link = node.xpath('./p/span/a/@href').extract_first()
         item['position_name'] = node.xpath('./p/span/a/@title').extract_first()
         item['company'] = node.xpath('./span[1]/a/@title').extract_first()
         item['work_address'] = node.xpath('./span[2]/text()').extract_first()
         item['salary'] = node.xpath('./span[3]/text()').extract_first()
         item['publishtime'] = node.xpath('./span[4]/text()').extract_first()
         yield scrapy.Request(url=detail_link, callback=self.parse_detail, meta={
             'item': item,
         })
     for url in next_page:
         # Schedule every pagination link; Scrapy's dupefilter drops the
         # ones already visited.
         yield scrapy.Request(url=url, callback=self.parse)
Example 9
    def parseContent(self, response):

        soup = BeautifulSoup(response.text, 'lxml')
        content = soup.find('div', class_='tHjob').find('div', class_='cn')
        item = Job51Item()
        # BeautifulSoup returns str on Python 3, so the values feed the
        # str operations below (split, find, concatenation) directly,
        # with no encode/decode round-trip.
        item['zwmc'] = content.find('h1')['title']
        item['gzdd'] = content.find('span', class_='lname').get_text()
        item['gzxz'] = content.find('strong').get_text()
        item['gsmc'] = content.find('p', class_='cname').find('a')['title']
        item['gslx'] = content.find('p', class_='ltype').get_text().split('|')[0]

        # The i1..i4 icons mark experience, education, headcount and
        # publish date inside the requirement spans.
        content1 = soup.find('div', class_='tCompany_main').find(
            'div', class_='jtag').find('div', class_='t1').find_all('span')
        for c in content1:
            if c.find('em', class_='i1') is not None:
                item['gzjy'] = c.get_text()
                break
            item['gzjy'] = ''
        for c in content1:
            if c.find('em', class_='i2') is not None:
                item['zdxl'] = c.get_text()
                break
            item['zdxl'] = ''
        for c in content1:
            if c.find('em', class_='i3') is not None:
                item['zprs'] = c.get_text()
                break
            item['zprs'] = ''
        for c in content1:
            if c.find('em', class_='i4') is not None:
                fbsj_temp = c.get_text()
                # Keep only the date before the "发布" (published) suffix.
                item['fbsj'] = fbsj_temp[:fbsj_temp.find('发布')]
                break
            item['fbsj'] = ''

        content2 = soup.find('div', class_='tCompany_main').find('p',
                                                                 class_='t2')
        fldy = ''
        if content2 is not None:
            spans = content2.find_all('span')
            for c in spans:
                fldy += c.get_text() + ','
        item['fldy'] = fldy
        return item
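
The four loops above differ only in the em class they search for; the lookup generalizes into one helper (a sketch under the same BeautifulSoup assumptions):

    def span_text_by_icon(spans, icon_class):
        # Text of the first span containing <em class=icon_class>, else ''.
        for span in spans:
            if span.find('em', class_=icon_class) is not None:
                return span.get_text()
        return ''

With it, item['gzjy'] = span_text_by_icon(content1, 'i1'), and likewise for i2 through i4.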
Example 10
    def process_item(self, item, spider):
        data = Job51Item(item)
        # Collapse stray whitespace inside the free-text field, then join
        # the fields with \t so Hadoop can split each record on tabs
        # downstream. dict field order is not fixed, hence the explicit
        # field-by-field concatenation.
        output_data = (data["title"] + "\t" + data["salary"] + "\t" +
                       data["place"] + "\t" + data["experience"] + "\t" +
                       data["education"] + "\t" + data["need_persons"] + "\t" +
                       data["publish_date"] + "\t" +
                       re.sub(r"\s+", " ", data["need_skill"]).strip()) + "\n"

        # Route each record to a per-day, per-title file.
        now_day = str(datetime.datetime.now().date())
        file_path = "D:/Code/GraduationProject/files/" + now_day + "_" + str(
            data["title"]) + ".txt"
        with open(file_path, "a", encoding="utf-8") as f:
            f.write(output_data)
        # Pipelines must return the item so later pipelines still see it.
        return item
Example 11
 def parse(self, response):
     # 51job embeds the search results as JSON in an inline <script> tag.
     job_json = re.findall(r"window.__SEARCH_RESULT__ = ([\s\S]*?)</script", response.text, re.S)
     if not job_json:
         return
     job_json = json.loads(job_json[0])
     for each_job in job_json['engine_search_result']:
         item = Job51Item()
         item['job_name'] = each_job['job_name']
         item['salary'] = each_job['providesalary_text']
         item['update_date'] = each_job['updatedate']
         item['company_name'] = each_job['company_name']
         item['company_type'] = each_job['companyind_text']
         item['work_address'] = each_job['workarea_text']
         item['company_size'] = each_job['companytype_text']
         item['welfare'] = ";".join(each_job['jobwelf_list'])
         item['job_href'] = each_job['job_href']
         yield Request(url=each_job['job_href'], callback=self.parse_detail, meta={'item': item})
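
The keys above are read straight from the embedded JSON payload; if any of them can be absent on some rows, dict.get with a default keeps one missing key from killing the whole page (a sketch reusing Example 11's key names):

    def job_fields(each_job):
        # Hypothetical helper: same keys as Example 11, with defaults.
        return {
            'job_name': each_job.get('job_name', ''),
            'salary': each_job.get('providesalary_text', ''),
            'update_date': each_job.get('updatedate', ''),
            'company_name': each_job.get('company_name', ''),
            'welfare': ';'.join(each_job.get('jobwelf_list', [])),
        }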
Example 12
 def real_data(self, response):
     item = Job51Item()
     item['url'] = response.url
     item['title'] = response.xpath('//h1/@title').extract_first()
     # default='' keeps the .strip() calls from raising AttributeError
     # when an XPath matches nothing.
     item['location'] = response.xpath('//div[@class="cn"]/p[2]/text()[1]').extract_first(default='').strip()
     item['company_name'] = response.xpath('//div[@class="cn"]/p/a[1]/text()').extract_first(default='').strip()
     item['salary'] = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
     item['company_info'] = response.xpath('//div[@class="com_tag"]/p/text()').extract()
     item['experience'] = response.xpath('//div[@class="cn"]/p[2]/text()[2]').extract_first(default='').strip()
     job_info = response.xpath(
         '//div[@class="bmsg job_msg inbox"]/p/text()|//div[@class="bmsg job_msg inbox"]/text()').extract()
     item['job_info'] = "".join(job_info).strip()
     address = response.xpath('//div[@class="bmsg inbox"]/p[@class="fp"]/text()').extract()
     item['address'] = "".join(address).replace('\r', '').replace('\n', '').replace('\t', '')
     yield item
Example 13
 def parse_job_info(self, response):
     # Skip the leading div.el nodes (the table header/filter rows).
     for echo in response.xpath('//div[@class="el"]')[4:]:
         # Fresh item per row, so earlier yields aren't mutated later.
         item = Job51Item()
         item["position_name"] = echo.xpath(
             './p/span/a/@title').extract_first()
         item["company"] = echo.xpath(
             './span[@class="t2"]/a/text()').extract_first()
         item["address"] = echo.xpath(
             './span[@class="t3"]/text()').extract_first()
         item["salary"] = echo.xpath(
             './span[@class="t4"]/text()').extract_first()
         item["time"] = echo.xpath(
             './span[@class="t5"]/text()').extract_first()
         yield item
     # Re-request the same URL with dedup filtering off; unless self.parse
     # stops somewhere, this re-schedules indefinitely.
     yield scrapy.Request(url=response.url,
                          callback=self.parse,
                          meta={},
                          dont_filter=True)
Example 14
 def parse_url(self, responses):
     response = Selector(responses)
     head = response.xpath(r'.//div[@class="cn"]')
     no = Parse_ele(head)
     # Use the search keyword as the title instead of the page's h1.
     # title = no.xpath_no(r'./h1/@title')
     title = responses.meta["search_key"]
     salary = no.xpath_no(r'./strong/text()')
     salary = self.changeSalary(salary)
     # The attribute test needs @class; contains(class, msg) would compare
     # two child elements and never match.
     need = no.xpath_no(r'./p[contains(@class, "msg")]/@title')
     needs = str(need).split('|')
     # "缺失" ("missing") is the placeholder for absent fields; setting the
     # defaults up front lets a failed extraction still yield a full item.
     place = education = experience = need_persons = publish_date = "缺失"
     need_skill = []
     try:
         place = needs[0].split('-')[0].strip()
         # Classify the remaining "|"-separated segments by keyword:
         # 经验 = experience, 人 = headcount, 发布 = publish date.
         for n in needs[1:]:
             if "经验" in n:
                 experience = n.strip()
             elif "人" in n:
                 need_persons = n.strip()
             elif "发布" in n:
                 publish_date = n.strip()
             else:
                 education = n.strip()
         need_skill = response.xpath(
             r'.//div[@class="bmsg job_msg inbox"]//text()').extract()
     except Exception as e:
         print("failed to extract job info", e, responses.url, sep=",")
     needs_skill = "".join([x for x in need_skill if x.strip() != ''])
     item = Job51Item(title=title,
                      salary=salary,
                      place=place,
                      experience=experience,
                      education=education,
                      need_persons=need_persons,
                      publish_date=publish_date,
                      need_skill=needs_skill)
     yield item
Example 15
 def parse(self, response):
     # 51job serves GBK-encoded pages; decode before matching.
     htmlstr = response.body.decode('GBK')
     # One regex pulls title, link, company, place, salary and date out
     # of each t1..t5 result row.
     reg = re.compile(
         r'class="t1.*?title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
         re.S)
     items = re.findall(reg, htmlstr)
     for x in items:
         item = Job51Item()
         item['jobname'] = x[0]
         item['companyname'] = x[2]
         item['jobadd'] = x[3]
         item['jobsalary'] = x[4]
         # getjobcontent() follows the detail link and yields tuples of
         # (experience, education, headcount, description).
         for y in self.getjobcontent(x[1]):
             # Strip tags, entities and stray whitespace from the body.
             item['jobcontent'] = re.sub(r'<.*?>|&nbsp|\t|\n|\r|\s;', '',
                                         y[3]).replace(r"\t", "").replace(
                                             r"\r", "")
             item['jobexperience'] = y[0]
             item['education'] = y[1]
             item['peoplepnumber'] = y[2]
             yield item
Example 16
 def parse(self, response):
     """Parse the listing (start) page."""
     # 1. Scrape the rows on the current page.
     job_list = response.xpath(
         "//div[@id='pageContent']/div[@class='items']/a")
     for job_one in job_list:
         # Fresh item per row; a single shared instance would be
         # overwritten before parse_detail ever saw it.
         item = Job51Item()
         # Job detail url
         item["jobHref"] = job_one.xpath("./@href").extract_first()
         # Work location
         item["jobErea"] = job_one.xpath("./i/text()").extract_first()
         # Company name
         item["jobCompany"] = job_one.xpath(
             "./aside/text()").extract_first()
         # Salary
         item["jobSalary"] = job_one.xpath("./em/text()").extract_first()
         # logging.warning("item::::::: %s" % item)
         yield scrapy.Request(
             item["jobHref"],
             callback=self.parse_detail,
             meta={"item": item})
     # 2. Follow the next page; on the last page the "next" link is
     # javascript:void(0); so skip it.
     next_url = response.xpath(
         "//div[@id='pageContent']/form[@id='turnpage']/div[@class='paging']/a[@class='next']/@href"
     ).extract_first()
     if next_url and "javascript" not in next_url:
         yield scrapy.Request(
             next_url,
             callback=self.parse)
Example 17
 def detail_parse(self, response):
     # Return the first match, or '' when the field is missing.
     ifexists = lambda x: x[0] if x else ''
     job = Job51Item()
     # Job title
     job['name'] = response.xpath(
         '//div[@class="tHeader tHjob"]//h1//text()').extract()[0]
     # Company name
     job['co_name'] = response.xpath(
         '//p[@class="cname"]/a//text()').extract()[0]
     # Area
     job['area'] = response.xpath(
         '//div[@class="tHeader tHjob"]//span/text()').extract()[0]
     # Salary
     job['salary'] = ifexists(
         response.xpath(
             '//div[@class="tHeader tHjob"]//strong/text()').extract())
     # Requirement tags: bucket each one by keyword (经验 = experience,
     # 人 = headcount, 发布 = publish date); plain "in" works on Python 3
     # str, no decode('utf8') needed.
     otherq = ''
     all_require = response.xpath(
         '//div[@class="tBorderTop_box bt"]//div[@class="t1"]/span/text()'
     ).extract()
     for require in all_require:
         if '经验' in require:
             job['exp'] = require
         elif require in self.edu_type:
             job['edu'] = require
         elif '人' in require:
             job['num'] = require
         elif '发布' in require:
             job['time'] = require
         else:
             otherq = otherq + require + ' '
     job['otherq'] = otherq
     # Benefits
     welfare = ' '
     fuli = response.xpath(
         '//div[@class="tBorderTop_box bt"]//p[@class="t2"]/span/text()'
     ).extract()
     for f in fuli:
         welfare = welfare + f + ' '
     job['welfare'] = welfare
     # Job description: drop the boilerplate lines listed in
     # self.unrequire and strip the rest in a single pass.
     posi_info = response.xpath(
         '//div[@class="tBorderTop_box"][1]//div[@class="bmsg job_msg inbox"]//text()'
     ).extract()
     posi_info = [i.strip() for i in posi_info if i not in self.unrequire]
     job['info'] = ' '.join(posi_info)
     # Work address
     job['local'] = ifexists(
         response.xpath(
             '//div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//p/text()[2]'
         ).extract())
     # Company url
     job['co_url'] = response.xpath(
         '//div[@class="tHeader tHjob"]//p[@class="cname"]/a/@href'
     ).extract()[0]
     # Company type: trim each "|"-separated segment, keep the separator.
     str1 = response.xpath(
         '//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()'
     ).extract()[0]
     job['co_type'] = ''.join(s.strip() + '|' for s in str1.split('|'))
     yield job
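
Examples 14 and 17 both bucket the requirement tags by keyword; the rule generalizes into one classifier (a sketch; the keywords come from those two examples, and education detection would still need Example 17's edu_type whitelist, so it falls into 'other' here):

    def classify_requirements(tags):
        # Bucket 51job requirement tags by the keywords used above.
        result = {'experience': '', 'headcount': '',
                  'publish_date': '', 'other': ''}
        for tag in tags:
            tag = tag.strip()
            if '经验' in tag:        # e.g. "3-4年经验"
                result['experience'] = tag
            elif '发布' in tag:      # e.g. "08-01发布"
                result['publish_date'] = tag
            elif '人' in tag:        # e.g. "招2人"
                result['headcount'] = tag
            else:
                result['other'] += tag + ' '
        return result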