Example 1
 def parse(self, response):
     """
     Parse the list-page data
     :param response: response object from the downloader ==> engine ==> spider
     :return:
     """
     # First grab the right list of divs ==> a CSS selector would also work
     div_list = response.xpath('//div[@class="el"]')[4:]
     for div in div_list:
         item = JobItem()
         item["name"] = div.xpath('./p/span/a/text()').extract_first().strip()
         item["job_url"] = div.xpath('./p/span/a/@href').extract_first()
         item["company_name"] = div.xpath('./span[1]/a/text()').extract_first()
         item["location"] = div.xpath('./span[2]/text()').extract_first()
         item["salary"] = div.xpath('./span[3]/text()').extract_first()
         item["release_date"] = div.xpath('./span[4]/text()').extract_first()
         yield scrapy.Request(
             url=item["job_url"],
             callback=self.parse_detail,
             meta={"item": item},
             dont_filter=True,
         )
         # break  # for testing
     # "下一页" = "Next page"; guard against the link missing on the last page
     next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
     if next_url:
         yield scrapy.Request(url=next_url)
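Note: none of these snippets include the JobItem definition itself; it lives in each project's items.py. A minimal sketch covering just the fields used in Example 1 (field names copied from the code above; the real definitions differ per project) could look like:

    import scrapy

    class JobItem(scrapy.Item):
        # one scrapy.Field() per key assigned in parse()
        name = scrapy.Field()
        job_url = scrapy.Field()
        company_name = scrapy.Field()
        location = scrapy.Field()
        salary = scrapy.Field()
        release_date = scrapy.Field()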
Example 2
    def parse_job(self, response):

        item = JobItem()

        item['LINK'] = response.url

        item['TITLE'] = response.xpath(
            '//h1[@class="job_title"]/text()').extract_first()
        if not item['TITLE']:
            return
        item['TITLE'] = item['TITLE'].replace('\n', ' ')

        item['SALARY'] = response.xpath(
            '//div[@class="salary"]/span[@class="salary-text"]/text()'
        ).extract_first()

        item['COMPANY'] = response.xpath(
            '//h3[@class="name"]/a/text()').extract_first()
        if not item['COMPANY']:
            return

        address = response.xpath(
            '//div[@class="address__full-address"]/span/text()').extract_first()
        if not address:
            return
        item['ADDRESS'] = address.rsplit(',')[-1].strip()
        if not item['ADDRESS']:
            return

        skill = response.xpath(
            '//div[@class="tag-list"]/a[@class="big ilabel mkt-track"]/span/text()'
        ).extract()
        item['SKILL'] = ''.join(skill).replace('\n', ' ').split()
        if not item['SKILL']:
            return

        job_type = response.xpath(
            '//p[@class="gear-icon"]/text()').extract_first()
        if not job_type:
            return
        item['TYPE'] = job_type.replace('\n', ' ')

        req = response.xpath(
            '//div[@class="experience"]/ul/li/text()').extract()
        if not req:
            return
        for text in req:
            # Vietnamese: "năm kinh nghiệm" = "years of experience"
            year = re.findall(
                r"(\d.*) (?:years of experience|years of|năm kinh nghiệm|of experience)",
                text)
            if year:
                item['EXP'] = year

            # Vietnamese: "Tốt nghiệp" = "graduated from"
            degree = re.findall(r'(?:Degree in|degree in|Tốt nghiệp) (.*)',
                                text)
            if degree:
                item['DEGREE'] = degree[0]

        yield item
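A side note on the guard-and-return checks above: they exist because extract_first() returns None when nothing matches, and chaining .replace() or .rsplit() onto None raises. Both extract_first() and the newer get() accept a default, which often shortens the pattern; a small sketch using the title XPath from this example:

    # get(default='') never returns None, so string methods are safe
    title = response.xpath('//h1[@class="job_title"]/text()').get(default='')
    if not title:
        return
    item['TITLE'] = title.replace('\n', ' ')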
Example 3
File: hh.py Project: kwaket/parsers
    def parse_item(self, response):
        item = JobItem()

        content = response.xpath(
            '(//div[@class="main-content"]//div[contains(@class, "bloko-column_container")])[1]'
        )
        vacancy_section = content.xpath(
            '(//div[@class="vacancy-description"]/div[@class="vacancy-section"])[1]/div[1]'
        )

        item['name'] = content.xpath(
            './/div[contains(@class, "vacancy-title")]/h1//text()').get()
        item['salary'] = content.xpath(
            './/p[@class="vacancy-salary"]//*/text()').getall()
        item['company'] = content.xpath(
            './/a[@data-qa="vacancy-company-name"]//*/text()').getall()
        item['address'] = content.xpath(
            './/p[@data-qa="vacancy-view-location"]//text()').getall()
        item['experience'] = content.xpath(
            './/*[@data-qa="vacancy-experience"]//text()').getall()
        item['employment_mode'] = content.xpath(
            './/*[@data-qa="vacancy-view-employment-mode"]//text()').getall()
        item['skills'] = content.xpath(
            './/*[contains(@data-qa, "skills-element")]/span/text()').getall()
        item['description'] = vacancy_section.get()
        item['url'] = response.request.url

        yield item
Example 4
 def parse_page(self, response):
     item = JobItem()
     profession = response.xpath(
         '/html/body/div[3]/div[3]/div[1]/span/text()').extract()
     description = response.xpath(
         '/html/body/div[3]/div[3]/div[2]/div[1]/div[1]/div[1]/text()'
     ).extract()
     requirement = response.xpath(
         '/html/body/div[3]/div[3]/div[1]/div[4]/text()').extract()
     companyName = response.xpath(
         '/html/body/div[3]/div[4]/div[1]/div/div[1]/div[1]/a/text()'
     ).extract()
     companyDescription = response.xpath(
         '/html/body/div[3]/div[3]/div[2]/div[2]/div[1]/div/div/div/p/text()'
     ).extract()
     scale = response.xpath(
         '/html/body/div[3]/div[4]/div[1]/div/p[2]/text()').extract()
     companyCategory = response.xpath(
         '/html/body/div[3]/div[4]/div[1]/div/p[1]/a/text()').extract()
     item['profession'] = profession
     item['description'] = description
     item['requirement'] = requirement
     item['companyName'] = companyName
     item['companyDescription'] = companyDescription
     item['scale'] = scale
     item['companyCategory'] = companyCategory
     yield item
Example 5
 def parse_item(self, response):
     item = JobItem()
     item['type'] = "1"
     item['ename'] = str(
         response.xpath(
             '/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title').
         extract()[0])
     item['postinfo'] = str(
         response.xpath(
             '/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a/@title').
         extract()[0])
     # The tag for this field is also wrong on some HTML pages; it just takes a little digging to locate and handle
     item['salary'] = str(
         response.xpath(
             '/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').
         extract()[0])
     item['oldurl'] = response.url
     # Some 51job detail pages omit the education requirement, leaving only three <span> tags,
     # which can raise an index-out-of-range error; occasional bad field values come from the same tag differences
     try:
         item['releasetime'] = str(
             response.xpath(
                 '/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span[4]/text()'
             ).extract()[0])
     except IndexError:
         item['releasetime'] = str(
             response.xpath(
                 '/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span[3]/text()'
             ).extract()[0])
     return item
Example 6
    def parse(self, response):
        node_list = response.xpath("//div[@class='el' and not(@id)]")

        for node in node_list:
            item = JobItem()

            item["position_name"] = node.xpath(
                "./p[@class='t1 ']/span/a/@title").extract_first()
            item["position_link"] = node.xpath(
                "./p[@class='t1 ']/span/a/@href").extract_first()
            item["company_name"] = node.xpath(
                "./span[@class='t2']/a/@title").extract_first()
            item["postiton_pay"] = node.xpath(
                "./span[@class='t4']/text()").extract_first()
            item["work_location"] = node.xpath(
                "./span[@class='t3']/text()").extract_first()
            item["publish_times"] = node.xpath(
                "./span[@class='t5']/text()").extract_first()
            yield item
            # the item goes to a pipeline for processing (you implement that yourself)
            # Requests go to the scheduler (the framework already handles this)
            logging.info(item['position_link'])
            logging.info(type(item['position_link']))
            yield scrapy.Request(item['position_link'],
                                 callback=self.parse_page)
Example 7
 def saveData(self, jobdict):
     """
       使用items以及pipeline实现爬虫数据持久化
       :params: dict 爬取到的一条信息
     """
     item = JobItem()
     item = jobdict
     yield item
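For reference, a scrapy.Item can be constructed directly from a mapping whose keys are declared fields, which is what the construction above relies on; an undeclared key raises KeyError. A one-line illustration (the keys here are hypothetical):

    item = JobItem({'name': 'backend dev', 'salary': '15k'})  # hypothetical field names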
Example 8
    def parse_item(self, response):
        item = JobItem()

        content = response.xpath(
            '(//div[@class="main-content"]//div[contains(@class, "bloko-column_container")])[1]'
        )
        vacancy_section = content.xpath(
            '(//div[@class="vacancy-description"]/div[@class="vacancy-section"])[1]/div[1]'
        )

        item['name'] = content.xpath(
            './/div[contains(@class, "vacancy-title")]/h1//text()').get()

        # extract the salary text fragments once, then index into the list
        salary_parts = content.xpath(
            './/p[@class="vacancy-salary"]//*/text()').getall()
        if len(salary_parts) > 1:
            item['salaryMIN'] = int(salary_parts[1].replace('\xa0', ''))
            # ' до ' (Russian for "up to") precedes the upper bound
            if len(salary_parts) > 3 and salary_parts[2] == ' до ':
                item['salaryMAX'] = int(salary_parts[3].replace('\xa0', ''))

        # section headings: "Обязанности" = duties, "Требования" = requirements, "Условия" = conditions
        xpath_t = './p/strong[contains(text(), "{}")]/ancestor::p/following::ul[1]/li/text()'
        item['duty'] = vacancy_section.xpath(
            xpath_t.format('Обязанности')).getall()
        item['requirements'] = vacancy_section.xpath(
            xpath_t.format('Требования')).getall()
        item['conditions'] = vacancy_section.xpath(
            xpath_t.format('Условия')).getall()

        item['company'] = content.xpath(
            './/a[@data-qa="vacancy-company-name"]//*/text()').getall()[-1]
        item['address'] = content.xpath(
            './/p[@data-qa="vacancy-view-location"]//text()').extract_first()
        item['experience'] = content.xpath(
            './/*[@data-qa="vacancy-experience"]//text()').getall()
        item['type_of_employment'] = content.xpath(
            './/*[@data-qa="vacancy-view-employment-mode"]//text()').getall(
            )[0]
        item['schedule'] = content.xpath(
            './/*[@data-qa="vacancy-view-employment-mode"]//text()').getall(
            )[-1]
        item['date'] = content.xpath(
            './/*[@class="vacancy-creation-time"]//text()').getall()[1]
        item['skills'] = content.xpath(
            './/*[contains(@data-qa, "skills-element")]/span/text()').getall()
        item['description'] = vacancy_section.get() or ''
        item['url'] = response.request.url

        yield item
Example 9
    def parse(self, response):

        jobs = response.css('.content')
        for job in jobs:
            item = JobItem()
            item['designation'] = job.css('.desig::text').extract()[0]
            item['company'] = job.css('.org::text').extract()[0]
            item['skill'] = job.css('.skill::text').extract()[0]
            item['jobDescription'] = job.css('.desc::text').extract()[0]
            yield item
Example 10
    def parse(self, response):
        item = JobItem()
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # total page count; the pager text reads "共N页,到第" ("N pages in total, go to page ...")
        allpage = int(
            re.match(r'共(\d+)页,到第',
                     soup.find(attrs={
                         'class': 'td'
                     }).string).group(1))
        # job rows (skipping the leading non-job rows)
        result = soup.find_all(attrs={'class': 'el'})[16:]
        print(result)
        for message in result:
            job_soup = BeautifulSoup(str(message), 'html.parser')
            if '-' in str(job_soup.find(attrs={'class': 't3'}).string):
                split = str(job_soup.find(attrs={
                    'class': 't3'
                }).string).split('-')
                item['city'] = split[0]  # city
                item['address'] = split[1]  # district
            else:
                item['city'] = str(job_soup.find(attrs={'class': 't3'}).string)
                item['address'] = ''
            item['jobname'] = ''.join(re.findall(r'\S+', str(
                job_soup.a.string)))  # job title
            item['company'] = str(job_soup.find(attrs={
                'class': 't2'
            }).string)  # company name
            item['salary'] = str(job_soup.find(attrs={
                'class': 't4'
            }).string)  # salary
            item['welfare'] = ''  # benefits
            item['education'] = '不限'  # education requirement ("不限" = unrestricted)
            item['workyear'] = '不限'  # work experience ("不限" = unrestricted)
            item['link'] = str(job_soup.a['href'])  # detail-page URL
            item['createtime'] = '2018-' + str(
                job_soup.find(attrs={
                    'class': 't5'
                }).string)
            yield item

        print('Requesting page %s' % self.page)
        time.sleep(random.randint(1, 10))  # random sleep
        if self.page < allpage:  # not the last page yet
            self.page += 1
            yield scrapy.FormRequest(url=self.base_url + self.kw + ',2,' +
                                     str(self.page) + self.w_url,
                                     headers=self.headers,
                                     method='GET',
                                     callback=self.parse,
                                     dont_filter=True)
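A note on the time.sleep calls in this and later examples: Scrapy callbacks run on a single reactor thread, so sleeping in a callback stalls every in-flight request, not just this one. The built-in throttle settings give the same politeness without blocking; a sketch of the relevant settings.py lines:

    # settings.py (sketch): let Scrapy pace requests instead of time.sleep
    DOWNLOAD_DELAY = 5                # seconds between requests to a domain
    RANDOMIZE_DOWNLOAD_DELAY = True   # jitter each delay by 0.5x-1.5x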
Example 11
    def parse_detail(self, response):
        nodes, driver, kw = response.meta.get("params")
        for node in nodes:
            print(node)
            driver.execute_script("window.open('%s')" % node)
            time.sleep(2)
            driver.switch_to.window(driver.window_handles[1])
            WebDriverWait(driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='detail-content']")))
            html = etree.HTML(driver.page_source)
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

            item = JobItem()
            item['recruitment_position'] = html.xpath(
                "//div[@class='job-primary detail-box']/div[@class='info-primary']/div[@class='name']/h1/text()"
            )[0]
            item['salary'] = html.xpath(
                "//div[@class='job-primary detail-box']/div[@class='info-primary']/div[@class='name']/span/text()"
            )[0]
            item['keyword'] = kw
            item['url'] = node
            item['source'] = "BOSS直聘"  # source site ("BOSS直聘" = BOSS Zhipin)
            item['update_date'] = html.xpath(
                '//div[@class="sider-company"]/p[last()]/text()')[0]
            item['company_name'] = html.xpath(
                '//a[@ka="job-detail-company_custompage"]')[0].attrib.get(
                    'title').strip().replace("\n招聘", "")  # drop the trailing "招聘" ("hiring") suffix
            # item['company_name'] = html.xpath('//div[@class="level-list"]/preceding-sibling::div[1]/text()')[0]
            item['work_experience'] = html.xpath(
                '//*[@class="job-primary detail-box"]/div[2]/p/text()')[1]
            item['education_background'] = html.xpath(
                '//*[@class="job-primary detail-box"]/div[2]/p/text()')[2]
            item['job_requirements'] = "".join(
                html.xpath(
                    '//div[@class="detail-content"]/div[@class="job-sec"]/div[@class="text"]/text()'
                ))
            item['company_info'] = "".join(
                html.xpath(
                    '//div[@class="job-sec company-info"]//div[@class="text"]/text()'
                ))
            item['company_address'] = html.xpath(
                '//*[@class="location-address"]/text()')[0]
            item['company_welfare'] = ",".join(
                html.xpath(
                    '//div[@class="job-banner"]/div[@class="job-primary detail-box"]/div[@class="info-primary"]/div[@class="tag-container"]/div[@class="job-tags"]/text()'
                ))
            item['id'] = get_md5(node)
            item['crawl_date'] = datetime.now().strftime("%Y-%m-%d")
            yield item
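The get_md5 helper used for item['id'] is not shown. A plausible implementation, assuming it just hashes the detail URL into a stable key (the project's own helper may differ):

    import hashlib

    def get_md5(url):
        # hex digest of the URL, usable as a fixed-length unique id (assumed helper)
        if isinstance(url, str):
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()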
Example 12
    def parse_job_info(self, response):
        item = JobItem()
        print("parse job info from %s" % response.url)
        item['posted_url'] = response.url
        item['posted_website'] = '前程无忧'  # "前程无忧" = 51job

        job_info = response.xpath('//div[@class="cn"]')
        item['company'] = job_info.xpath('p/a/text()').extract_first()
        item['position'] = job_info.xpath('h1/text()').extract_first()
        item['city'] = job_info.xpath(
            './/span[@class="lname"]/text()').extract_first()
        item['salary'] = job_info.xpath('./strong/text()').extract_first()

        company_str = job_info.xpath(
            './p[@class="msg ltype"]/text()').extract_first()
        company_str_list = re.sub('[\t\r\n \xa0]', '', company_str).split('|')
        for company_item in company_str_list:
            # "公司" = company, "合资" = joint venture, "外资" = foreign-funded,
            # "国企" = state-owned, "单位" = organization
            if '公司' in company_item or '合资' in company_item \
                or '外资' in company_item or '国企' in company_item \
                or '单位' in company_item:
                item['company_type'] = company_item
            # company sizes look like "150-500人" ("人" = people)
            elif '人' in company_item and '0' in company_item:
                item['company_size'] = company_item
            else:
                item['company_industry'] = company_item

        item['experience_requirement'] = response.xpath(
            '//span[@class="sp4"][em[@class="i1"]]/text()').extract_first()
        item['education_requirement'] = response.xpath(
            '//span[@class="sp4"][em[@class="i2"]]/text()').extract_first()
        item['recruiting_number'] = response.xpath(
            '//span[@class="sp4"][em[@class="i3"]]/text()').extract_first()
        item['posted_date'] = response.xpath(
            '//span[@class="sp4"][em[@class="i4"]]/text()').extract_first()
        position_advantage_list = response.xpath(
            '//div[@class="jtag inbox"]/p/span/text()').extract()
        item['position_advantage'] = merge_string(position_advantage_list,
                                                  delimiter=",")

        position_info_list = response.xpath(
            '//div[@class="bmsg job_msg inbox"]/text()').extract()
        item['position_info'] = merge_string(position_info_list)

        item['position_tag'] = "NULL"
        item['department'] = "NULL"
        item['position_type'] = "NULL"
        item["major_requirement"] = "NULL"
        item['company_finance'] = "NULL"
        item['company_url'] = "NULL"
        yield item
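merge_string is another project helper that is not shown. Judging from the call sites, it joins a list of text fragments with an optional delimiter; a hedged sketch:

    def merge_string(strings, delimiter=''):
        # join stripped, non-empty fragments (an assumption about the real helper)
        return delimiter.join(s.strip() for s in strings if s and s.strip())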
Example 13
    def parse(self, response):
        print("request -> " + response.url)
        # if we got redirected to a CAPTCHA page (its URL is much longer)
        if len(response.url) > 40:
            # sound an alert
            duration = 10000  # milliseconds
            freq = 440  # Hz
            winsound.Beep(freq, duration)
            # pause until the CAPTCHA is solved manually
            os.system("pause")
            return self.this_request()
        print(response.css('i.total_page::text'))
        self.currentpagetotal = int(response.css('i.total_page::text').extract()[0])
        print(self.currentpagetotal)

        job_list = response.css('li.job_item')
        if job_list:
            print("job58 Nums:" + str(len(job_list)))
            for job in job_list:
                item = JobItem()

                item['Jname'] = job.css('div.job_title > div.job_name > a > span::text').extract()[-1].strip()
                item['Jarea'] = response.css('div.zp_crumb>div.crumb_item>a::text').extract()[0].split('58')[0]
                item['Jtype'] = job.css("div.job_comp > p.job_require > span::text").extract()[0].strip()
                item['Jcompany'] = job.css('div.comp_name > a::text').extract_first().strip()

                item['Jeducation'] = job.css("div.job_comp > p.job_require > span::text").extract()[1].strip()

                item['Jexperience'] = job.css("div.job_comp > p.job_require > span::text").extract()[2].strip().split('年')[0]  # "年" = years

                welfare = job.css('div.job_title > div.job_wel > span::text').extract()
                item['Jwelfare'] = ','.join(welfare)

                salary = job.css('div.job_title > p::text').extract()[0].split('-')
                try:
                    item['JminSalary'] = float(salary[0])/1000
                    item['JmaxSalary'] = float(salary[-1])/1000
                except ValueError:  # salary text was not numeric (e.g. negotiable)
                    pass
                # # company size
                # item['JcomSize'] = ''
                # # financing stage
                # item['JcomFinanceStage'] = ''
                # source site name ("58同城" = 58.com)
                item['Jsource'] = '58同城'
                print(item)
                yield item
            yield self.next_request()
Example 14
File: a51job.py Project: MiniOK/job
    def parse(self, response):
        body = response.css(".job-primary")
        for head in body:
            item = JobItem()
            item["title"] = head.css(".job-title::text").extract()[0]
            item["wage"] = head.css(".red::text").extract()[0]
            item["site"] = head.css(".info-primary p::text").extract()[0]
            item["name"] = head.css(".company-text .name a::text").extract()[0]
            item["expert"] = head.css(".info-primary p::text").extract()[1]
            item["edu"] = head.css(".info-primary p::text").extract()[2]
            yield item

        # pagination
        next_page = response.css(".page .next::attr(href)").extract_first()
        if next_page is not None:
            yield response.follow("https://www.zhipin.com" + next_page, callback=self.parse)
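Note that response.follow resolves relative URLs against the current response URL, so the manual site-root concatenation is usually unnecessary:

    # equivalent, assuming next_page is an href relative to the current page
    yield response.follow(next_page, callback=self.parse)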
Example 15
File: cnnb.py Project: nenyah/job
 def parse_content(self, response):
     '''
     Parse the post detail page.
     @param response: the downloaded page content
     yields a JobItem with the extracted fields
     '''
     jobitem = JobItem()
     jobitem['link'] = response.url
     jobitem['author'] = self._get_author(response)
     jobitem['title'] = self._get_title(response)
     jobitem['pub_date'] = self._get_date(response)
     jobitem['content'] = self._get_content(response)
     jobitem['phone'] = self._get_phone(jobitem['content'])
     jobitem['email'] = self._get_email(jobitem['content'])
     jobitem['crawl_date'] = datetime.today().strftime('%Y-%m-%d')
     yield jobitem
Example 16
    def parse(self, response):
        item = JobItem()
        data = json.loads(response.text)
        result = data['data']['results']  # result set
        numfound = data['data']['numFound']  # total number of hits
        for message in result:
            item['city'] = message['city']['items'][0]['name']  # city
            item['jobname'] = message['jobName']  # job title
            item['company'] = message['company']['name']  # company name
            item['salary'] = message['salary']  # salary
            item['welfare'] = ''
            for i in message['welfare']:
                item['welfare'] += i + ' '  # benefits
            if len(message['city']['items']) == 2:
                item['address'] = message['city']['items'][1]['name']  # district
            else:
                item['address'] = ''
            item['education'] = message['eduLevel']['name']  # education requirement
            item['workyear'] = message['workingExp']['name']  # work experience
            item['link'] = message['positionURL']  # detail-page URL
            item['createtime'] = message['updateDate']  # post date
            yield item
        print('Requesting page %s' % self.page)
        time.sleep(random.randint(1, 10))  # random sleep

        if self.page < int(numfound/90):  # not the last page yet
            self.page += 1
            yield scrapy.FormRequest(
                url=self.url,
                headers=self.headers,
                method='GET',
                formdata={
                    'start': str((self.page - 1) * 90),
                    'pageSize': '90',
                    'workExperience': '-1',
                    'education': '-1',
                    'companyType': '-1',
                    'employmentType': '-1',
                    'jobWelfareTag': '-1',
                    'kw': self.kw,
                    'kt': '3',
                    '_v': '0.84269824',
                    'x-zp-page-request-id': '081b0f20bc5a4752a7db7315bce7fc06-1545653322592-595069'
                },
                callback=self.parse
            )
Example 17
    def parse(self, response):
        zw_div = response.xpath('//div[@class="el"]')

        for zw in zw_div:
            item = JobItem()

            item['post'] = self.extract_with_xpath(zw, 'p/span/a/@title')
            item['company'] = self.extract_with_xpath(zw, 'span[@class="t2"]/a/text()')
            item['city'] = self.extract_with_xpath(zw, 'span[@class="t3"]/text()')
            item['salary'] = self.extract_with_xpath(zw, 'span[@class="t4"]/text()')
            item['publish_date'] = self.extract_with_xpath(zw, 'span[@class="t5"]/text()')
            item['href'] = self.extract_with_xpath(zw, 'p[@class="t1 "]/span/a/@href')

            yield item

        # pagination: //div[@class="dw_page"]//ul/li[last()]/a/@href
        for next_page in response.xpath('//li[@class="bk"][2]/a/@href'):
            yield response.follow(next_page, self.parse)
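extract_with_xpath is a spider helper that this snippet does not include. Judging from the call sites, it wraps the usual extract-first-with-default pattern; a sketch (an assumption, not the project's actual code):

    def extract_with_xpath(self, selector, query):
        # first matching text for a relative XPath, stripped; '' when absent
        return selector.xpath(query).extract_first(default='').strip()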
Example 18
 def parse_detail(self, response):
     jobs = ItemLoader(JobItem(), response=response)
     url_md5 = hashlib.md5()
     url_md5.update(response.url.encode(encoding='utf-8'))
     jobs.add_css('job_name', '.tHjob .cn h1::attr(title)')
     jobs.add_css('company_name', '.tHjob .cname a::attr(title)')
     jobs.add_css('experience', '.tBorderTop_box .jtag .t1 span:nth-child(1)::text')
     jobs.add_css('Education', '.tBorderTop_box .jtag .t1 span:nth-child(2)::text')
     jobs.add_css('workplace', '.tHeader .lname::text')
     jobs.add_css('salary_min', '.tHjob .cn strong::text')
     jobs.add_css('salary_max', '.tHjob .cn strong::text')
     jobs.add_value('url', response.url)
     jobs.add_value('object_url', url_md5.hexdigest())
     jobs.add_css('jb_description', '.tCompany_main div[class=tBorderTop_box] .job_msg')
     jobs.add_css('company_type',  '.tHjob .in .cn .ltype::text')
     jobs.add_css('company_people_min',  '.tHjob .in .cn .ltype::text')
     jobs.add_css('company_people_max', '.tHjob .in .cn .ltype::text')
     jobs.add_css('company_work',  '.tHjob .in .cn .ltype::text')
     return jobs.load_item()
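Example 18 feeds the same strong::text CSS into both salary_min and salary_max, which only works if the item's field processors split the raw salary range during load_item(). A hedged sketch of what such a field definition might look like in items.py (the parsing logic and the raw salary format are assumptions):

    import scrapy
    from itemloaders.processors import MapCompose, TakeFirst

    def salary_low(value):
        # keep the part before the '-' of a salary range string
        return value.split('-')[0]

    class JobItem(scrapy.Item):
        salary_min = scrapy.Field(input_processor=MapCompose(salary_low),
                                  output_processor=TakeFirst())
        # ... salary_max analogous, plus the other fields used above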
Example 19
 def parse(self, response):
     for post in response.xpath("//article/div"):
         # create a fresh item per post
         item = JobItem()
         # print(p)
         # target database
         # item['Jdb']='demo'
         # job title
         item['Jname'] = post.xpath(
             './header/h1/a/text()').extract()[0].strip()
         # salary: deprecated, replaced by min/max salary below
         # item['Jsalary'] = 0
         # # region
         # item['Jarea'] = ''
         # job type
         item['Jtype'] = post.xpath('./div/text()').extract()[0].strip()
         # job requirements
         # item['Jrequirements'] = ''
         # # company name
         # item['Jcompany'] = ''
         # # tags
         # item['Jtag'] = ''
         # # benefits
         # item['Jwelfare'] = ''
         # # education requirement
         # item['Jeducation'] = ''
         # # experience requirement
         # item['Jexperience'] = ''
         # # minimum salary
         # item['JminSalary'] = 0
         # # maximum salary; both in thousands (k), int
         # item['JmaxSalary'] = 0
         # # salary payments per year, int
         # item['JpayTimes'] = 0
         # # company type
         # item['JcomType'] = ''
         # # hire count, as a string
         # item['JhireCount']='0'
         # # company size
         # item['JcomSize'] = '500-1000'
         item['JcreatedTime'] = '2020-12-03'
         item['JisSchoolJob'] = 1
         item['Jlocation'] = '(23.45443,135.74434)'
         yield item
Example 20
 def parse_item(self, response):
     item = JobItem()
     # title
     item['title'] = response.xpath('/html/head/title/text()').extract()
     addtime = response.xpath(
         '//*[@id="wrapper"]/div[3]/div[1]/div[2]/p/text()').extract()
     item['addtime'] = "".join(addtime).strip()
     # content: first try the rule for pages that contain images; if it matches,
     # a list of all the text nodes comes back
     content = response.xpath(
         '//*[@id="wrapper"]/div[3]/div[1]/div[3]/p/text()').extract()
     # an empty list means no match, so fall back to the rule for pages without images
     if len(content) == 0:
         content = response.xpath(
             '//*[@id="wrapper"]/div[3]/div[1]/div[3]/p[1]/text()').extract()
     item['content'] = "".join(content).strip()
     # link
     item['url'] = response.url
     yield item
Example 21
 def parse(self, response):
     print(response.url)
     div = response.xpath("//div[@id='resultList']/div[@class='el']")
     for node in div:
         item = JobItem()
         # query each field relative to its own row node, so a row with a
         # missing field cannot shift the remaining columns out of alignment
         item['jobname'] = node.xpath("./p/span/a/@title").extract_first()
         item['companyname'] = node.xpath(
             "./span[@class='t2']/a/@title").extract_first()
         item['location'] = node.xpath(
             "./span[@class='t3']/text()").extract_first()
         pay = node.xpath("./span[@class='t4']").extract_first()
         item['pay'] = pay[17:-7]  # strip the <span class="t4"> wrapper from the raw HTML
         item['time'] = node.xpath(
             "./span[@class='t5']/text()").extract_first()
         yield item
Example 22
    def parse(self, response):
        for result in response.xpath('//*[@id="main"]/div/div[2]/ul/li'):
            jobName = result.xpath('div/div[1]/h3/a/div[1]/text()').extract()[0]
            salary = result.xpath('div/div[1]/h3/a/span/text()').extract()[0]
            address_during_education = result.xpath('div/div[1]/p/text()').extract()[0]
            company = result.xpath('div/div[2]/div/h3/a/text()').extract()[0]
            url = result.xpath('div/div[1]/h3/a/@href').extract()[0]

            item = JobItem()
            item['jobName'] = jobName
            item['salary'] = salary
            item['info'] = address_during_education
            item['company'] = company

            # fetch the detail page
            # yield Request(url, callback=self.parse_item, meta={'item': item})
            yield item

        # pagination: once per page, after all the items
        next_page_url = response.xpath('//*[@id="main"]/div/div[2]/div[2]/a[5]/@href').extract()[0]
        if next_page_url != 'javascript:void(0)':
            yield scrapy.Request(next_page_url, callback=self.parse)
Example 23
    def parse(self, response):
        item = JobItem()
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        result = soup.find_all(attrs={'class': 'list-noimg job-list clearfix new-dl'})
        print(result[1])
        for message in result:
            job_soup = BeautifulSoup(str(message), 'html.parser')
            item['city'] = '北京'  # city ("北京" = Beijing)
            item['education'] = '不限'  # education requirement ("不限" = unrestricted)
            item['workyear'] = '不限'  # work experience ("不限" = unrestricted)
            item['jobname'] = job_soup.find(attrs={'class': 'list_title gj_tongji'}).string
            item['company'] = job_soup.find(attrs={'class': 'new-dl-company'}).a['title']
            item['salary'] = ''.join(re.findall(r'\S+', str(job_soup.find(attrs={'class': 'new-dl-salary'}).string)))
            item['link'] = job_soup.find(attrs={'class': 'list_title gj_tongji'})['href']
            # post dates are relative to the crawl date; "昨天" = "yesterday"
            if '-' in str(job_soup.find(attrs={'class': 'pub-time'}).span.string):
                item['createtime'] = '2018-' + str(job_soup.find(attrs={'class': 'pub-time'}).span.string)
            elif str(job_soup.find(attrs={'class': 'pub-time'}).span.string) == '昨天':
                item['createtime'] = '2018-12-25'
            else:
                item['createtime'] = '2018-12-26'
            item['address'] = job_soup.find(attrs={'class': 'pay'}).string
            try:
                item['welfare'] = job_soup.find(attrs={'class': 'new-dl-tags'}).find('i').string
            except AttributeError:  # no welfare tags on this listing
                item['welfare'] = ''
            yield item

        print('Requesting page %s' % self.page)
        time.sleep(random.randint(1, 10))  # random sleep
        if self.page < 100:
            self.page += 1
            yield scrapy.FormRequest(
                url='http://bj.ganji.com/zpjisuanjiwangluo/o%s/' % str(self.page),
                method='GET',
                callback=self.parse,
                dont_filter=True
            )
Example 24
 def parse1(self, response):
     item = JobItem()
     city = response.meta['city']
     city2 = response.meta['city2']  # used as the table name
     ul_list = response.xpath('//*[@id="sidebar-right"]/ul')
     sql = 'create table if not exists {} (title varchar(30) null,' \
           'price varchar(25) null,requie varchar(30) null,' \
           'company varchar(25) null)charset=utf8mb4;'.format(city2)
     creat_table(sql)
     print('CREATE TABLE SUCCESS')
     item['city_table'] = city2
     time.sleep(2)
     for ur in ul_list:
         li_list = ur.xpath('./li')
         for li in li_list:
             url = 'https://{}.58.com'.format(city) + \
                   li.xpath('./strong/a/@href').extract_first()
             print(url)
             yield scrapy.Request(url,
                                  callback=self.parse2,
                                  meta={'item': item})
Example 25
 def parse(self, response):
     elist = response.xpath(
         '//*[@id="resultList"]//div[@class="el"]').extract()
     for el in elist:
         # a fresh item per row; reusing one item across yields can mix up data
         item = JobItem()
         item['title'] = re.findall('title="(.*?)"', el, re.S)[0]
         item['url'] = re.findall('href="(.*?)"', el, re.S)[0]
         item['company'] = re.findall('title="(.*?)"', el, re.S)[1]
         item['company_url'] = re.findall('href="(.*?)"', el, re.S)[1]
         item['loc'] = re.findall('<span class="t3">(.*?)</span>', el,
                                  re.S)[0]
         item['money'] = re.findall('<span class="t4">(.*?)</span>', el,
                                    re.S)[0]
         item['time'] = re.findall('<span class="t5">(.*?)</span>', el,
                                   re.S)[0]
         print(item)
         yield item
     for p in range(2, 2001):
         url = "https://search.51job.com/list/180200,000000,0000,00,9,99,%2520,2," + str(
             p) + ".html"
         yield scrapy.Request(url, callback=self.parse)
Example 26
    def parse(self, response):
        job_list = response.css('.infoBox ul.infoList.teachinList')
        for job_info in job_list:
            item = JobItem()
            item['company_name'] = job_info.css('li.span1 a::text').extract_first().encode('utf-8')
            item['company_link'] = job_info.css('li.span1 a::attr(href)').extract_first()
            item['briefing_city'] = job_info.css('li.span2::text').extract_first()
            item['school'] = job_info.css('li.span5::text').extract_first()
            item['room'] = job_info.css('li.span4::text').extract_first()
            item['time'] = job_info.css('li.span5::text').extract()[1]

            # yield item
            company_detail = job_info.css('li.span1 a::attr(href)').extract_first()

            if company_detail is not None:
                # company_detail: /teachin/view/id/{company_page_id}
                yield response.follow(url=self.base_url + company_detail, callback=self.detail)

        next_page = response.css('div.ctl ul.page li.next a::attr(href)').extract_first()
        if next_page is not None:
            # next_page: /teachin/index?page={page_num}
            next_page = self.base_url + next_page
            yield scrapy.Request(next_page, callback=self.parse)
Example 27
    def parse_job_info(self, response):
        url = response.url
        site = url.split("//")[1].split('/')[0]
        print('parsing: %s' % response.url)
        if site == 'forbidden.lagou.com':  # exit if blocked
            print("Oh, I'm blocked.")
            return None

        item = JobItem()
        item['url'] = url
        item['company_name'] = response.xpath(
            '//div[@class="company"]/text()').extract_first()
        item['job_position'] = response.xpath(
            '//div[@class="job-name"]/span[@class="name"]/text()'
        ).extract_first()
        job_requests = response.xpath(
            '//dd[@class="job_request"]/p/span/text()').extract()
        for job_request in job_requests:
            if 'k-' in job_request:  # salary range, e.g. "10k-20k"
                item['salary'] = job_request
            elif '经验' in job_request:  # "经验" = experience
                item['experience_requirement'] = job_request
            # "学历" = education; "大专"/"本科"/"硕士"/"博士" = associate/bachelor/master/doctorate
            elif '学历' in job_request \
                or '大专' in job_request \
                or '本科' in job_request \
                or '硕士' in job_request \
                or '博士' in job_request:
                item['education_requirement'] = job_request
            # "全职"/"兼职"/"实习" = full-time/part-time/internship
            elif '全职' in job_request \
                or '兼职' in job_request \
                or '实习' in job_request:
                item['job_type'] = job_request
            else:
                item['city'] = re.sub('[ /]', '', job_request)
        # other fields left unset

        yield item
Example 28
    def parse(self, response):
        json_data = response.selector.re(
            r"(window\.__SEARCH_RESULT__\s=\s)(\{.*\})")
        json_data = json.loads(json_data[1])
        total_page = json_data['total_page']
        for data in json_data["engine_search_result"]:
            item = JobItem()
            page_job_url = data['job_href']
            print(page_job_url)
            # job title
            item['job_name'] = data['job_name']
            # company name
            item['company_name'] = data['company_name']
            # salary
            item['providesalary_text'] = data['providesalary_text']
            # update date
            item['updatedate'] = data['updatedate']
            # company type
            item['companytype_text'] = data['companytype_text']
            # company size
            item['companysize_text'] = data['companysize_text']
            # main line of business
            item['companyind_text'] = data['companyind_text']
            # job benefits
            item['jobwelf'] = data['jobwelf']
            # job requirements
            item['attribute_text'] = '|'.join(data['attribute_text'])

            yield scrapy.Request(page_job_url,
                                 callback=self.parse_info,
                                 meta={'item': item})
        # page xpath //div[@class='p_in']/span[1]/text()
        for count in range(2, int(total_page) + 1):
            next_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25A8%258B%25E5%25BA%258F%25E5%2591%2598,2,{}.html?lang=c%2F&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(
                count)
            yield scrapy.Request(next_url, callback=self.parse)
Example 29
    def parse(self, response):
        page_items = response.xpath('//div[@class="el"]')
        for job_item in page_items:
            job = JobItem()
            # job['post'] = job_item.xpath('normalize-space(//p/span/a)').extract()
            job['post'] = job_item.xpath('p/span/a/@title').extract()
            job['company'] = job_item.xpath(
                'span[@class="t2"]/a/text()').extract()
            job['position'] = job_item.xpath(
                'span[@class="t3"]/text()').extract()
            job['salary'] = job_item.xpath(
                'span[@class="t4"]/text()').extract()
            job['time'] = job_item.xpath('span[@class="t5"]/text()').extract()
            if not job['salary']:
                job['salary'] = "商议决定"  # "商议决定" = to be negotiated
            url = job_item.xpath('p/span/a/@href').extract()
            if url:
                # detail page
                yield scrapy.Request(url[0],
                                     meta={'item': job},
                                     callback=self.parse_msg,
                                     dont_filter=True)  # dont_filter: do not dedupe the detail pages

Example 30
 def parse_items(self, response):
     # print(response.body.decode('gb2312'))
     print('start processing item')
     item = JobItem()
     url = response.url
     print(url)
     # helper: take the first element if the page has data, else ''
     f = lambda x: x[0] if x else ''
     company = response.xpath('//p[@class="cname"]/a/@title').extract()  # company
     # if company is present this is a job-detail page; otherwise it is a company page
     if company:
         company = company[0]
         # print(company)
         position = response.xpath('//h1/text()').extract()[0]  # job title
         # print(position)
         salary = response.xpath('//div[@class="cn"]//strong/text()').extract()[0]  # salary
         location = response.xpath('//span[@class="lname"]/text()').extract()[0].strip()  # location
         work_address = location
         # print(location)
         work_years = f(response.xpath('//div[@class="t1"]//span[1]/text()').extract())  # work experience
         # print(work_years)
         # if no education requirement is shown, default to "no requirement"
         ie = response.xpath('//span/em[@class="i2"]').extract()
         if ie:
             degree = response.xpath('//div[@class="t1"]//span[2]/text()').extract()[0]  # education
             # print(degree)
         else:
             degree = '不限学历'  # "不限学历" = no education requirement
             # print(degree)
         position_type = response.xpath('//p[@class="msg ltype"]/text()').extract()[0].strip()  # job category
         print(response.xpath('//p[@class="msg ltype"]/text()').extract()[0])
         # clean the job category: strip the '|' separators and whitespace
         position_type = [i.strip() for i in position_type.split('|')]
         print(position_type)
         tags = f(response.xpath('//p[@class="t2"]/span/text()').extract())  # tags
         print(tags)
         pub_date = response.xpath('//div[@class="t1"]//span/text()').extract()  # publication date
         # print(str(pub_date))
         date = re.compile(r'\d{2}-\d{2}')
         datep = date.findall(str(pub_date))
         # print(datep[0])
         desc = response.xpath('//div[@class="bmsg job_msg inbox"]//p/text()').extract()
         # the description HTML sometimes wraps lines in <p> tags and sometimes does not
         if desc:
             position_desc1 = desc
         else:
             position_desc1 = response.xpath('//div[@class="bmsg job_msg inbox"]/text()').extract()  # job description
         position_desc = []
         # clean up the description lines
         for i in position_desc1:
             i = i.strip()
             print(i)
             if i != '':
                 position_desc.append(i.strip())
         print(position_desc)
         # work_address = ''.join(response.xpath('//div[@class="bmsg inbox"]/a/@onclick').extract())  # work address
         # if len(work_address) > 1:
         #     adds = work_address.split(',')[1]
         #     work_address = adds.split(')')[0]
         #     print work_address
         # else:
         #     work_address = work_address[0]
         item['url'] = self.md5(url)
         item['company'] = company
         item['tags'] = tags
         item['position'] = position
         item['salary'] = salary
         item['degree'] = degree
         item['location'] = location
         item['work_years'] = work_years
         item['position_type'] = position_type
         item['pub_date'] = datep[0]
         item['position_desc'] = position_desc
         item['work_address'] = work_address
         print(item)
         yield item