Example #1
    def parse(self, response):
        jsonBody = json.loads(response.body.decode())
        results = jsonBody['content']['positionResult']['result']

        items = []

        for result in results:
            item = LagouItem()
            item['job'] = result['positionName']
            item['address'] = result['city']
            item['money'] = result['salary']
            item['req'] = result['education'] + '/' + result['workYear']
            item['company'] = result['companyFullName']
            item['qua'] = result['industryField'] + '/' + result['financeStage']
            item['des'] = ','.join(result['companyLabelList'])

            items.append(item)
        return items
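
All of these snippets populate a LagouItem, whose definition is not shown. A minimal sketch matching the keys used in Example #1 might look like this (the field names come from the snippet above; everything else is an assumption):

    import scrapy

    class LagouItem(scrapy.Item):
        # fields inferred from the keys assigned in Example #1
        job = scrapy.Field()
        address = scrapy.Field()
        money = scrapy.Field()
        req = scrapy.Field()
        company = scrapy.Field()
        qua = scrapy.Field()
        des = scrapy.Field()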
Example #2
    def parse(self, response):
        print("我执行了!!!!!!!!!!!!!!!!")
        self.i += 1
        print(self.i)
        item = LagouItem()
        if response.status != 200:
            print(response.status)
            self.status = response.status
            item['url'] = response.meta['url']
            item['code'] = self.status
            item['keyword'] = self.keyword
            return item

        html = pq(response.text)
        self.status = response.status
        item['code'] = self.status
        item['keyword'] = self.keyword
        item['id'] = response.meta['id']
        try:
            address_list = html(".work_addr").children()
            address = ""
            for temp in address_list:
                address += temp.text or ""  # .text may be None for element nodes
            item['address'] = address
            item['url'] = response.meta['url']
            item['advantage'] = html(".job-advantage p").text()
            item['company'] = html(".company").text()
            description_list = html(".job_bt p")
            description = str(description_list).replace("<p>", " ").replace(
                "</p>", " ").replace("<br/>", " ")
            item['description'] = description
            job_info = html(".job_request p span")
            item['salary'] = job_info[0].text
            item['location'] = job_info[1].text.replace("/", "")
            item['experience'] = job_info[2].text.replace("/", "")
            item['education'] = job_info[3].text
            item['type'] = job_info[4].text
            item['label'] = html(".position-label li").text()
            item['name'] = html(".job-name").attr('title')
        except Exception as e:
            print(item['url'], e)
        return item
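
Example #2 parses the HTML with pyquery's pq rather than Scrapy selectors. A minimal standalone illustration of the calls it relies on (the HTML fragment here is made up):

    from pyquery import PyQuery as pq

    html = pq('<div><span class="company">ACME Inc.</span></div>')
    # css-style selection plus .text(), as used for item['company'] above
    print(html('.company').text())  # -> ACME Inc.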
Example #3
    def parse(self, response):
        item = LagouItem()
        tag_url = response.xpath('//li[@class="wrapper"]//div/ul/li/a/@href')
        next_page = response.xpath(
            '//div[@class="pager_container"]/span[last()]/@class')
        for each in response.xpath('//li[@class="company-item"]'):
            lg_company_url = each.xpath('div/p[1]/a/@href').extract_first()
            tag = each.xpath(
                'div/p[@class="indus-stage wordCut"]/text()').extract_first()
            item['tag'] = tag
            item['lg_company_url'] = lg_company_url
        yield scrapy.Request(url='https://www.lagou.com/gongsi/10483.html',
                             headers=self.headers,
                             callback=self.company_page,
                             meta={'item': item})
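
The request at the end of Example #3 hands the partially filled item to company_page through meta. That callback is not shown; a sketch of how such a callback might recover and complete the item (the company_name field and the h1 selector are assumptions, not part of the original spider):

    def company_page(self, response):
        # recover the item passed along in the request's meta dict
        item = response.meta['item']
        item['company_name'] = response.xpath('//h1/text()').extract_first()
        yield item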
Example #4
 def parseDetail(self, response):
     print(response.status)
     if response.status == 200:
         item = LagouItem()
         # company name
         item["companyName"] = response.xpath("/html/body/div[2]/div/div[1]/div/div[1]/text()").extract()
         # position name
         item["positionName"] = response.xpath("/html/body/div[2]/div/div[1]/div/span/text()").extract()
         # position summary
         item["positionIntro"] = response.xpath("/html/body/div[2]/div/div[1]/dd/p[1]/span/text()").extract()
         # position tags
         item["positionLabel"] = response.xpath("/html/body/div[2]/div/div[1]/dd/ul/li/text()").extract()
         # responsibilities
         item["workResponsibility"] = response.xpath("//*[@id='job_detail']/dd[2]/div/p/text()").extract()
         # benefits
         item["positionWelfare"] = response.xpath("//*[@id='job_detail']/dd[1]/p/text()").extract()
         # work address
         item["workAddress"] = response.xpath("//*[@id='job_detail']/dd[3]/div[1]/a/text()").extract()
         yield item
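
Example #4 calls .extract() on every selector, which always returns a list of strings (possibly empty). When a single value is wanted, .extract_first() is the safer variant because it returns None instead of raising IndexError on a missing node. A self-contained comparison:

    from scrapy.selector import Selector

    sel = Selector(text='<div><span>Engineer</span></div>')
    all_matches = sel.xpath('//span/text()').extract()            # ['Engineer']
    first_match = sel.xpath('//missing/text()').extract_first()   # None, no IndexError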
Example #5
    def parse(self, response):
        jdict = json.loads(response.body)
        jcontent = jdict['content']
        jposresult = jcontent["positionResult"]
        jresult = jposresult["result"]

        # 15 results per page; floor division plus one covers the partial last page
        self.totalPageCount = jposresult['totalCount'] // 15 + 1
        print(self.totalPageCount)

        for each in jresult:
            item = LagouItem()  # a fresh item per result keeps yielded items independent
            item['city'] = each['city']
            item['companyFullName'] = each['companyFullName']
            item['companySize'] = each['companySize']
            item['positionName'] = each['positionName']
            item['firstType'] = each['firstType']
            item['salary'] = each['salary']
            yield item

        if self.curpage <= self.totalPageCount:
            self.curpage += 1
            yield scrapy.http.FormRequest(self.reqUrl,
                                          formdata={
                                              'pn': str(self.curpage),
                                              'kd': self.kd
                                          },
                                          callback=self.parse)
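
Example #5 derives the page count as totalCount // 15 + 1, assuming 15 results per page. That overshoots by one page whenever totalCount is an exact multiple of the page size; a one-line ceiling division avoids it (the page size of 15 is carried over from the snippet):

    total_count = 450
    page_size = 15
    total_pages = -(-total_count // page_size)  # ceiling division: 30, not 31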
Example #6
 def parse(self, response):
     print("request -> " + response.url)
     html = json.loads(response.text)
     if html.get("success"):
         if html.get('content').get('positionResult').get('resultSize') != 0:
             results = html.get('content').get('positionResult').get('result')
             print('lagou results: ' + str(len(results)))
             for result in results:
                 item = LagouItem()
                 item['salary'] = result.get('salary').replace("k", "K")
                 item['positionName'] = result.get('positionName')
                 item['positionLables'] = result.get('positionLables')
                 item['companyFullName'] = result.get('companyFullName')
                 item['companyLabelList'] = result.get('companyLabelList')
                 item['companySize'] = result.get('companySize')
                 item['city'] = result.get('city')
                 item['district'] = result.get('district')
                 item['education'] = result.get('education')
                 item['firstType'] = result.get('firstType')
                 item['industryField'] = result.get('industryField')
                 item['jobNature'] = result.get('jobNature')
                 item['workYear'] = result.get('workYear')
                 yield item
         totalPage = math.floor(
             int(html.get('content').get('positionResult').get("totalCount")) /
             int(html.get('content').get("pageSize")))
         self.curPage = self.curPage + 1
         if self.curPage <= totalPage:
             yield self.next_request(response)
     else:
         time.sleep(60)
         yield self.next_request(response)
Example #7
    def parse_item(self, response):
        item = LagouItem()
        # position name
        item['position_name'] = response.xpath(
            '//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
        # company name
        item['company'] = response.xpath(
            '//div[@class="company"]/text()').extract()[0]
        # salary
        item['salary'] = response.xpath(
            '//dd[@class="job_request"]//span[1]/text()').extract()[0]
        # experience
        item['experience'] = response.xpath(
            '//dd[@class="job_request"]//span[3]/text()').extract()[0]
        # education
        item['education'] = response.xpath(
            '//dd[@class="job_request"]//span[4]/text()').extract()[0]
        # work location
        item['location'] = response.xpath(
            '//input[@name="positionAddress"]/@value').extract()[0]

        yield item
Example #8
 def parse_item(self, response):
     i = LagouItem()
     url = response.url
     print(url)
     i["title"] = response.xpath(
         '//div[@class="position-content"]/div/div[@class="job-name"]/@title'
     ).extract_first()
     i["url"] = response.url
     i["salary"] = response.xpath(
         '//dd[@class="job_request"]/span[@class="salary"]/text()'
     ).extract_first()
     # the job_request spans appear in a fixed order:
     # salary, city, experience, education, job type
     spans = response.xpath('//dd[@class="job_request"]/span/text()').extract()
     i["job_city"] = spans[1]
     i["work_years"] = spans[2]
     i["degree_need"] = spans[3]
     i["job_type"] = spans[4]
     i["publish_time"] = response.xpath(
         '//dd[@class="job_request"]/p[@class="publish_time"]/text()'
     ).extract_first()
     i["job_advantage"] = response.xpath(
         '//dd[@class="job-advantage"]/p/text()').extract_first()
     i["job_desc"] = response.xpath('//dd[@class="job_bt"]/div/p/text()').extract()
     # city, district and road are the first three links under work_addr
     addr_parts = response.xpath(
         '//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()'
     ).extract()
     i["job_addr"] = "".join(addr_parts[:3])
     i["company_url"] = response.xpath(
         '//div[@class="content_r"]/dl[@class="job_company"]/dt/a/@href'
     ).extract_first()
     i["company_name"] = response.xpath(
         '//div[@class="content_r"]/dl[@class="job_company"]/dt/img/@alt'
     ).extract_first()
     return i
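
Example #8 assembles the address from exactly three nodes under work_addr, which breaks on pages that carry fewer parts. A defensive variant joins whatever non-empty parts are present (the markup here is a made-up stand-in for the real page):

    from scrapy.selector import Selector

    sel = Selector(text='<div class="work_addr"><a>Beijing</a><a>Haidian</a></div>')
    parts = sel.xpath('//div[@class="work_addr"]/a/text()').extract()
    # join every non-empty part instead of assuming city/district/road all exist
    job_addr = ''.join(p.strip() for p in parts if p.strip())  # 'BeijingHaidian'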
Example #9
    def parse(self, response):
        response_body = json.loads(response.text)
        for company in response_body["content"]["positionResult"]["result"]:
            lagouitem = LagouItem()
            lagouitem["company_id"] = company["companyId"]
            lagouitem["position_name"] = company["positionName"]
            lagouitem["work_year"] = company["workYear"]
            lagouitem["education"] = company["education"]
            lagouitem["position_id"] = company["positionId"]
            lagouitem["salary"] = company["salary"]
            lagouitem["cmpany_full_name"] = company["companyFullName"]
            print(lagouitem)
            yield lagouitem
Example #10
 def parse(self, response):
     pagecode = json.loads(
         response.body)['content']['positionResult']['result']
     for job in pagecode:
         item = LagouItem()  # a fresh item per posting
         item['jobname'] = job['positionName']
         item['releasetime'] = job['createTime']
         item['salary'] = job['salary']
         item['companyname'] = job['companyFullName']
         item['experience'] = job['workYear']
         item['Education'] = job['education']
         yield item
     # request the following result pages (the endpoint's page numbers start at 1)
     for i in range(1, 31):
         post_data = {
             'first': 'true',
             'kd': 'python',
             'pn': '{}'.format(i),
             'city': u'上海'
         }
         yield FormRequest(url=self.joburl,
                           formdata=post_data,
                           callback=self.parse)
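
Example #10 posts the same form fields for every page. In the other snippets the Lagou search endpoint takes first='true' only on the first page; a small helper encoding that convention (the URL and field names follow the snippets, the helper itself is a sketch):

    from scrapy.http import FormRequest

    def search_page_request(url, page, keyword, city, callback):
        # first='true' only for page 1, 'false' afterwards; pn is 1-based
        return FormRequest(url=url,
                           formdata={'first': 'true' if page == 1 else 'false',
                                     'pn': str(page),
                                     'kd': keyword,
                                     'city': city},
                           callback=callback)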
Example #11
    def parse(self, response):
        divs = response.xpath('//*[@id="s_position_list"]/ul/li/div[1]')
        for div in divs:
            item = LagouItem()  # a fresh item per listing
            title = div.xpath('./div[1]/div[1]/a/h3/text()').extract()
            address = div.xpath('./div[1]/div[1]/a/span/em/text()').extract()
            money = div.xpath('./div[1]/div[2]/div/span/text()').extract()
            company = div.xpath('./div[2]/div[1]/a/text()').extract()
            fintance = div.xpath('./div[2]/div[2]/text()').extract()

            job_title = title[0] if len(title) > 0 else '无数据'
            job_address = address[0] if len(address) > 0 else '无数据'
            job_money = money[0] if len(money) > 0 else '无数据'
            job_company = company[0] if len(company) > 0 else '无数据'
            job_fintance = fintance[0] if len(fintance) > 0 else '无数据'

            item['title'] = job_title.strip()
            item['address'] = job_address.strip()
            item['money'] = job_money.strip()
            item['company'] = job_company.strip()
            item['fintance'] = job_fintance.strip()

            yield item
Example #12
    def parse_url(self, response):

        jobclass = response.meta["jobClass"]

        for sel in response.xpath("//ul[@class='item_con_list']/li"):
            # initialise a fresh item for this listing
            item = LagouItem()

            jobname = sel.xpath("div/div/div/a/h3/text()").extract()
            jobmoney = sel.xpath("div/div/div/div/span/text()").extract()
            # the numeric indices below rely on the fixed order of text
            # nodes in the listing markup
            jobneed = sel.xpath("div/div/div/div/text()").extract()
            jobneed = jobneed[2].strip()

            jobcompany = sel.xpath("div/div/div/a/text()").extract()
            jobcompany = jobcompany[3].strip()

            jobplace = sel.xpath("div/div/div/a/span/em/text()").extract()

            jobtype = sel.xpath("div/div/div/text()").extract()
            jobtype = jobtype[7].strip()

            jobspesk = sel.xpath(
                "div[@class='list_item_bot']/div/text()").extract()
            jobspesk = jobspesk[-1].strip()

            item['jobClass'] = jobclass
            item['jobName'] = jobname
            item['jobMoney'] = jobmoney
            item['jobNeed'] = jobneed
            item['jobCompany'] = jobcompany
            item['jobPlace'] = jobplace
            item['jobType'] = jobtype
            item['jobSpesk'] = jobspesk

            yield item
Example #13
    def parse(self, response):

        content = json.loads(response.text)['content']

        page = content['pageNo']
        self.success_pages.append(page)

        # parse the payload and pull out the needed fields
        results = content['positionResult']['result']
        for result in results:
            job = LagouItem()
            job['job_name'] = result['positionName']
            job['job_addr'] = result['district']
            job['job_time'] = result['createTime']
            job['job_limit'] = result['education'] + '、' + result['workYear']
            job['job_salary'] = result['salary']
            job['job_company'] = result['companyFullName']
            job['job_company_type'] = result['financeStage'] + '、' + result[
                'industryField']
            job['job_vip'] = result['positionAdvantage']
            job['page'] = page
            yield job
Example #14
    def parse_item(self, response):
        print(response.url)
        item = LagouItem()
        job_url = response.url  # page URL
        # company name
        job_comp = response.css('#job_company dt a img::attr(alt)').extract()[0]
        # position name
        job_name = response.xpath('//div[@class="job-name"]/@title').extract()[0]
        # education requirement
        job_degree = response.xpath(
            '//dd[@class="job_request"]/p/span[4]/text()').extract()[0].rstrip(' /')
        money = response.xpath(
            '//dd[@class="job_request"]/p/span[@class="salary"]/text()').extract()[0]
        if '-' in money:
            job_smoney = money.lower().replace('k', '').split('-')[0]  # salary lower bound
            job_emoney = money.lower().replace('k', '').split('-')[1]  # salary upper bound
        else:
            job_smoney = 0
            job_emoney = 0
        # work location
        job_address = response.xpath(
            '//dd[@class="job_request"]/p/span[2]/text()').extract()[0].lstrip('/').rstrip(' /')
        # company type
        job_comp_type = response.xpath(
            '//ul[@class="c_feature"]/li[2]/text()').extract()[1].strip()
        # company business field, cut out of the raw li HTML
        job_business = response.xpath(
            '//ul[@class="c_feature"]/li[1]').extract()[0].split(
                r'</i>')[1].split('<span class="hovertips">')[0].strip()
        # publication date
        job_date_pub = response.xpath(
            '//p[@class="publish_time"]/text()').extract()[0].split(' ')[0]
        # company headcount
        job_num = response.xpath(
            '//ul[@class="c_feature"]/li[3]/text()').extract()[1]
        if '-' in job_num:
            job_comp_snum = job_num.split('-')[0]  # minimum headcount
            job_comp_enum = job_num.split('-')[1].replace('人', '')  # maximum headcount
        elif '以上' in job_num:
            job_comp_snum = job_num.replace('人以上', '')
            job_comp_enum = job_comp_snum
        else:
            job_comp_snum = 0
            job_comp_enum = 0

        job_year = response.xpath(
            '//dd[@class="job_request"]/p/span[3]/text()').extract()[0]

        if '-' in job_year:
            job_syear = job_year.split('-')[0].replace('经验', '')  # minimum years of experience
            job_eyear = job_year.split('-')[1].replace('年 /', '')  # maximum years of experience
        elif '以上' in job_year:
            job_syear = job_year.replace('年以上', '').lstrip('经验')
            job_eyear = job_syear
        else:
            job_syear = 0
            job_eyear = 0
        job_datetime = datetime.datetime.now().strftime('%Y-%m-%d')  # crawl date
        # company perks
        job_welfafe = response.xpath(
            '//dd[@class="job-advantage"]/p/text()').extract()[0]
        job_people = ''  # number of openings (not on the page)
        # job description
        job_desc = response.xpath(
            '//dd[@class="job_bt"]//p/text()').extract()
        job_desc = str(job_desc).lstrip('[').rstrip(']').replace(r'\xa0', '')
        job_request = ''  # job requirements (not on the page)
        # position tag categories
        job_tag = response.xpath(
            '//li[@class="labels"]/text()').extract()
        job_tag = ','.join(job_tag)

        item['job_name'] = job_name
        item['job_degree'] = job_degree
        item['job_smoney'] = job_smoney
        item['job_emoney'] = job_emoney
        item['job_address'] = job_address
        item['job_comp_snum'] = job_comp_snum
        item['job_comp_enum'] = job_comp_enum
        item['job_syear'] = job_syear
        item['job_eyear'] = job_eyear
        item['job_datetime'] = job_datetime
        item['job_welfafe'] = job_welfafe
        item['job_people'] = job_people
        item['job_desc'] = job_desc
        item['job_request'] = job_request
        item['job_tag'] = job_tag
        item['job_url'] = job_url
        item['job_comp'] = job_comp
        item['job_comp_type'] = job_comp_type
        item['job_business'] = job_business
        item['job_date_pub'] = job_date_pub

        yield item
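
Example #14 repeats the same split logic for salary, headcount and experience ranges. A small helper in the same spirit, covering the '10k-20k' and fallback cases handled above (a sketch, not part of the original spider):

    def parse_salary(text):
        # '10k-20k' -> ('10', '20'); anything without a dash -> (0, 0),
        # mirroring the snippet's fallback (note the mixed str/int types)
        text = text.lower().replace('k', '')
        if '-' in text:
            low, high = text.split('-')[:2]
            return low, high
        return 0, 0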
Example #15
    def parse(self, response):
        keyword = response.url.split('kd=')[1]  # pull the keyword from the url
        pageNo = 0
        try:
            json_data = json.loads(response.body.decode('utf-8'))

            pageNo = json_data['content']['pageNo']  # current page number
            if pageNo > 0:
                print('\n start page %s' % str(pageNo))
            results = json_data['content']['positionResult']['result']
            # unpack the json results into items
            for result in results:
                item = LagouItem()
                item['spider'] = self.name
                item['keyword'] = keyword
                item['jobId'] = str(result['positionId'])
                item['jobName'] = result['positionName']
                item['jobPlace'] = result['city']
                item['jobSalary'] = result['salary']
                item['jobAdvantage'] = result['positionAdvantage']  # perks and benefits
                item['releaseTime'] = result['createTime']  # publication time
                item['educationRequire'] = result['education']  # minimum education
                item['experienceRequire'] = result['workYear']  # experience requirement
                item['jobNature'] = result['jobNature']  # full-time, part-time, internship, etc.
                lt = result['positionLables'] or []  # guard against a null label list
                lt.append(keyword)
                item['jobLabels'] = lt  # position tags, e.g. team lead, data analysis

                item['compId'] = str(result['companyId'])
                item['compName'] = result['companyFullName']
                item['compSize'] = result['companySize']  # company size
                item['compIndustry'] = result['industryField']  # company industry
                item['compLabels'] = result['companyLabelList']  # company labels

                item['longitude'] = result['longitude']
                item['latitude'] = result['latitude']
                item['businessZones'] = result['businessZones']  # more specific location
                item['compLogo'] = result['companyLogo']  # company logo
                item['financeStage'] = result['financeStage']  # financing stage

                job_desc_url = 'https://www.lagou.com/jobs/' + item['jobId'] + '.html'
                comp_desc_url = 'https://www.lagou.com/gongsi/' + item['compId'] + '.html'
                item['jobLink'] = job_desc_url
                item['compLink'] = comp_desc_url
                yield scrapy.Request(url=job_desc_url,
                                     headers=self.headers,
                                     callback=self.job_desc_parse,
                                     meta={'item': item})
        except Exception as e:
            if pageNo == 0:
                print('page number out of range')
            else:
                print('error_page:', pageNo)
                print('error_info:', e)
Example #16
 def parse(self, response: Response):
     """
     解析并填充item
     :param response:
     :return:
     """
     res_json = response.text
     extract_status = Compose(json.loads, SelectJmes("status"))
     status = extract_status(res_json)
     if status is None:
          # success: the payload carries no "status" field
         extract_result = Compose(json.loads, SelectJmes("content"),
                                  SelectJmes("positionResult"),
                                  SelectJmes("result"))
         result_list = extract_result(response.text)
         for res in result_list:
             loader = LagouItemLoader(item=LagouItem())
             loader.add_value("post_time", res)
             loader.add_value("job_name", res)
             loader.add_value("salary", res)
             loader.add_value("place", res)
             loader.add_value("job_nature", res)
             loader.add_value("experience", res)
             loader.add_value("education", res)
             loader.add_value("job_kind", res)
             loader.add_value("advantage", res)
             loader.add_value("company_name", res)
             loader.add_value("company_size", res)
             loader.add_value("company_industry", res)
             loader.add_value("id", res)
             loader.add_value(
                 "link",
                 self.job_detail_url.format(
                     id=loader.get_output_value("id")))
             this_item = loader.load_item()
             yield this_item
     else:
          # on failure, request the home page again to refresh cookies, then retry
         key = uuid.uuid4()
         yield Request(url=self.get_cookies_url,
                       callback=self.empty,
                       meta={"cookiejar": key},
                       headers=self.header_dict,
                       priority=5,
                       dont_filter=True)
         yield FormRequest(url=response.url,
                           formdata={
                               "first": "true",
                               "pn": str(response.meta['page']),
                               'kd': ""
                           },
                           callback=self.parse,
                           meta={
                               "cookiejar": key,
                               "page": response.meta['page']
                           },
                           method="POST",
                           headers=self.header_dict,
                           priority=4,
                           dont_filter=True)
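
Example #16 builds its extractors from Compose and SelectJmes (the latter needs the jmespath package installed). A minimal demonstration of how such a composed extractor behaves on a raw JSON string:

    import json
    from scrapy.loader.processors import Compose, SelectJmes

    extract_status = Compose(json.loads, SelectJmes('status'))
    print(extract_status('{"status": "ok"}'))   # -> ok
    print(extract_status('{"content": {}}'))    # -> None (key absent)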
Example #17
    def parse(self, response):
        json_data = response.text
        if '<html>' not in json_data:
            data = json.loads(json_data, object_pairs_hook=OrderedDict)
            if data['success'] is True:
                # page number reported inside the json payload
                self.page_no = data['content']['pageNo']
                if self.page_no != 0:
                    result = data['content']['positionResult']['result']
                    for item in result:
                        lagou_data = LagouItem()

                        # MongoDB-style id (MongoDB is not used here)
                        lagou_data['_id'] = string_to_md5(
                            string=str(item['companyId']) +
                            str(item['positionId']))
                        lagou_data['from_website'] = "拉勾"

                        # salary bounds
                        try:
                            salary = item.get('salary').split('-')
                            lagou_data['min_salary'] = salary[0]
                            lagou_data['max_salary'] = salary[1]
                        except IndexError:
                            salary = item.get('salary').split('以上')
                            lagou_data['min_salary'] = salary[0]
                            lagou_data['max_salary'] = '不限'

                        # work location
                        try:
                            lagou_data['location'] = item['city'] + item['district']
                        except TypeError:
                            lagou_data['location'] = item['city']
                        # publication time
                        lagou_data['publish_date'] = int(
                            time_to_timestamp(time_str=item['createTime']))
                        # job nature
                        lagou_data['work_type'] = item['jobNature']
                        # years of experience
                        lagou_data['work_experience'] = item['workYear']
                        # education requirement
                        lagou_data['limit_degree'] = item['education']
                        # number of openings
                        lagou_data['people_count'] = 0
                        # position name
                        lagou_data['work_name'] = item['positionName']
                        # responsibilities
                        lagou_data['work_duty'] = ""
                        # requirements
                        lagou_data['work_need'] = ""
                        # company name
                        lagou_data['business_name'] = item['companyFullName']
                        # financing stage
                        lagou_data['business_type'] = item['financeStage']
                        # company size
                        lagou_data['business_count'] = item['companySize']
                        # company industry
                        lagou_data['business_industry'] = item['industryField']
                        # company page url
                        company_url = 'https://www.lagou.com/gongsi/%s.html' % item['companyId']
                        # job posting url
                        job_url = 'https://www.lagou.com/jobs/%s.html' % item['positionId']

                        # job detail page
                        lagou_data['work_info_url'] = job_url

                        yield scrapy.Request(url=job_url,
                                             method="GET",
                                             cookies=ALL_COOKIES,
                                             callback=self.parse_job_info,
                                             headers=HEADERS,
                                             meta={
                                                 "lagou_data": lagou_data,
                                                 "company_url": company_url
                                             },
                                             dont_filter=False)

                    # next page, requested once after all items on this page
                    url = self._url.format(self._search_name,
                                           self.page_no + 1)
                    yield scrapy.Request(url=url,
                                         method="GET",
                                         callback=self.parse)
                else:
                    raise CloseSpider(reason="End of Page num!")
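
Example #17 hashes companyId + positionId into _id through a string_to_md5() helper defined elsewhere in that project. A plausible implementation, assuming it returns the hex digest of the UTF-8 input:

    import hashlib

    def string_to_md5(string):
        # hex md5 of the utf-8 encoded input, used as a stable document id
        return hashlib.md5(string.encode('utf-8')).hexdigest()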
Example #18
    def parse(self, response):
        infojson = json.loads(response.body)
        result = infojson['content']['positionResult']['result']
        if result:
            for i in result:
                item = LagouItem()  # a fresh item per posting
                item['companySize'] = i['companySize']
                item['firstType'] = i['firstType']
                item['appShow'] = i['appShow']
                item['pcShow'] = i['pcShow']
                item['positionName'] = i['positionName']
                item['education'] = i['education']
                item['financeStage'] = i['financeStage']
                item['city'] = i['city']
                item['companyLogo'] = i['companyLogo']
                item['district'] = i['district']
                item['companyId'] = i['companyId']
                item['explain'] = i['explain']
                item['industryField'] = i['industryField']
                item['createTime'] = i['createTime']
                item['positionLables'] = i['positionLables']
                item['score'] = i['score']
                item['adWord'] = i['adWord']
                item['formatCreateTime'] = i['formatCreateTime']
                item['salary'] = i['salary']
                item['workYear'] = i['workYear']
                item['lastLogin'] = i['lastLogin']
                item['jobNature'] = i['jobNature']
                item['deliver'] = i['deliver']
                item['gradeDescription'] = i['gradeDescription']
                item['imState'] = i['imState']
                item['companyFullName'] = i['companyFullName']
                item['companyLabelList'] = i['companyLabelList']
                item['positionId'] = i['positionId']
                item['companyShortName'] = i['companyShortName']
                item['approve'] = i['approve']
                item['businessZones'] = i['businessZones']
                item['plus'] = i['plus']
                item['secondType'] = i['secondType']
                item['positionAdvantage'] = i['positionAdvantage']
                item['publisherId'] = i['publisherId']
                item['promotionScoreExplain'] = i['promotionScoreExplain']
                yield item

            # request the next page once; 'first' is only 'true' on page 1
            self.pn += 1
            first = 'true' if 1 == self.pn else 'false'
            yield FormRequest(url=self.urls,
                              headers=self.headers,
                              formdata={
                                  'first': first,
                                  'pn': str(self.pn),
                              },
                              callback=self.parse)
        else:
            print('-----------------------------------------------')
            print('no more results')
            print('-----------------------------------------------')
            return
Example #19
    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = LagouItem()
        # company name
        co_name = response.xpath(
            "//div[@class='company']/div[@class='company_name']/a/text()").get()
        # position name
        name = response.xpath(
            "//li[@class='con_list_item default_list']//div[@class='p_top']//h3/text()"
        ).get()
        # salary
        salary = response.xpath(
            "//div[@class='postion']//div[@class='p_bot']//span[@class='money']/text()"
        ).get()
        # district
        area = response.xpath(
            "//div[@class='postion']//div[@class='p_bot']//span[@class='add']/em/text()"
        ).get()
        # years of experience and education share the same li_b_l text node,
        # so these two selectors return the same raw string
        exp = response.xpath(
            "//div[@class='postion']//div[@class='p_bot']//div[@class='li_b_l']/text()"
        ).get()
        # education
        edu = response.xpath(
            "//div[@class='postion']//div[@class='p_bot']//div[@class='li_b_l']/text()"
        ).get()
        # publication time
        pub_time = response.xpath(
            "//div[@class='p_top']//span[@class='format-time']/text()").get()
        pub_time = self.getVal(pub_time)
        # job description
        info = response.xpath(
            "//li[@data-positionname=$val]//div[@class='list_item_bot']/div[@class='li_b_l']/span/text()",
            val=name).getall()
        info = ";".join(info)
        print("info" * 20, info)
        # info = self.getVal(info)
        # if info != '':
        #     info = '\n'.join(info).encode('utf-8')

        # work location
        local = response.xpath(
            "//div[@class='postion']//div[@class='p_bot']//span[@class='add']//em/text()"
        ).get()
        # company perks
        welfare = response.xpath(
            "//div[@class='list_item_bot']//div[@class='li_b_r']/text()").get()
        # company website
        co_url = ""
        # number of openings
        num = '0'
        # company industry
        co_type = response.xpath('//dl[@class="industry"]/text()').get()
        # print("++" * 80)
        # print(name, co_name, area, salary, exp, edu, num, time, welfare, info, local, co_url, co_type)
        item['name'] = name
        item['co_name'] = co_name
        item['area'] = area
        item['salary'] = salary
        item['exp'] = exp
        item['edu'] = edu
        item['num'] = num
        item['time'] = pub_time
        item['welfare'] = welfare
        item['info'] = str(info)
        item['local'] = local
        item['co_url'] = co_url
        item['co_type'] = co_type

        yield item
Example #20
    def parse(self, response):
        """
        json.loads()将json格式转为python格式,按照html响应的json文件格式进行字典引用
        增加time.sleep()是防止出现KeyError: "content"错误, 应该是爬取页面太快, ajax未成功加载出
        """
        #time.sleep(10)
        jdict = json.loads(response.body)
        #time.sleep(10)
        try:
            jcontent = jdict["content"]["positionResult"]
        except Exception, e:
            print repr(e) + "?" * 30
            jdict = json.loads(response.body)
            jcontent = jdict["content"]["positionResult"]
        finally:
            jresult = jcontent["result"]
            for each in jresult:
                item = LagouItem()
                item['city'] = each["city"].encode('utf-8')
                item['company'] = each["companyFullName"].encode('utf-8')
                item['size'] = each["companySize"].encode('utf-8')
                item['zone'] = each["district"].encode('utf-8')
                item['createtime'] = each["createTime"].encode('utf-8')
                item['labels'] = each["positionLables"].encode('utf-8')
                item['positionName'] = each["positionName"].encode('utf-8')
                item['salary'] = each["salary"].encode('utf-8')
                item['education'] = each["education"].encode('utf-8')
                item['workyear'] = each["workYear"].encode('utf-8')
                print "-" * 30
                yield item
Example #21
    def parse_position(self, response):
        post_headers = self.headers.copy()

        post_headers.update({
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_'
        })
        try:
            jdict = json.loads(response.body)
            print(jdict)
        except json.decoder.JSONDecodeError:
            yield scrapy.http.FormRequest(url='{post_url}&city={city}'.format(
                post_url=self.post_url, city=self.city),
                                          headers=post_headers,
                                          formdata={
                                              'pn': str(self.curpage),
                                              'first': 'true',
                                              'kd': self.key
                                          },
                                          dont_filter=True,
                                          callback=self.parse_position)
            return  # jdict is unbound here, so stop after re-queuing

        if not jdict.get('success', None):
            print('Spider detected by the anti-crawl system; retrying')
            yield scrapy.http.FormRequest(url='{post_url}&city={city}'.format(
                post_url=self.post_url, city=self.city),
                                          headers=post_headers,
                                          formdata={
                                              'pn': str(self.curpage),
                                              'first': 'true',
                                              'kd': self.key
                                          },
                                          dont_filter=True,
                                          callback=self.parse_position)
        else:
            jcontent = jdict['content']
            jposresult = jcontent["positionResult"]
            positions = jposresult["result"]
            for _position in positions:
                item = LagouItem()  # a fresh item per position
                item['city'] = _position['city']  # city
                item['position_name'] = _position['positionName']  # position name
                item['business_zones'] = _position['businessZones'] or ['']  # work zones
                item['company_full_name'] = _position['companyFullName']  # company full name
                item['company_short_name'] = _position['companyShortName']  # company short name
                item['company_lable_list'] = _position['companyLabelList'] or ['']  # company perks
                item['company_size'] = _position['companySize']  # company size
                item['education'] = _position['education']  # education requirement
                item['finance_stage'] = _position['financeStage'] or None  # financing stage
                item['first_type'] = self.key_list[self.direction_index]['first_type']  # primary category
                item['industry_field'] = _position['industryField'] or None  # company industry
                item['job_nature'] = _position['jobNature'] or None  # job nature
                item['position_lables'] = _position['positionLables'] or ['']  # position tags
                item['salary'] = _position['salary']  # salary range
                # derive numeric bounds from strings like '10k-20k'
                temp_list = _position['salary'].split('-')
                if len(temp_list) == 1:
                    item['salary_max'] = int(
                        temp_list[0][:temp_list[0].find('k')])
                else:
                    item['salary_max'] = int(
                        temp_list[1][:temp_list[1].find('k')])
                item['salary_min'] = int(temp_list[0][:temp_list[0].find('k')])
                item['salary_avg'] = (item['salary_max'] +
                                      item['salary_min']) / 2
                item['second_type'] = self.key
                item['work_year'] = _position['workYear']
                yield item
            if not self.total_page_count:
                self.total_page_count = jposresult['totalCount'] // 15 + 1
                print(self.total_page_count)
            if self.curpage < self.total_page_count - 1:
                self.curpage += 1
                print('Turn pages to ', self.curpage)
                yield scrapy.http.FormRequest(
                    url='{post_url}&city={city}'.format(post_url=self.post_url,
                                                        city=self.city),
                    headers=post_headers,
                    formdata={
                        'pn': str(self.curpage),
                        'first': 'true',
                        'kd': self.key
                    },
                    callback=self.parse_position)
            elif self.city_index < len(self.city_list) - 1:
                self.city_index += 1
                self.curpage = 1
                self.total_page_count = None
                self.key = self.key_words[self.key_index]
                self.city = self.city_list[self.city_index]
                print('Change the city to ', self.city)
                yield scrapy.http.FormRequest(url=self.post_url +
                                              '&city={}'.format(self.city),
                                              headers=post_headers,
                                              formdata={
                                                  'pn': str(self.curpage),
                                                  'first': 'true',
                                                  'kd': self.key
                                              },
                                              callback=self.parse_position)
            elif self.key_index < len(self.key_words) - 1:
                self.curpage = 1
                self.key_index += 1
                self.city_index = 0
                self.total_page_count = None
                self.city = self.city_list[self.city_index]
                self.key = self.key_words[self.key_index]
                print('Change the keyword to ', self.key)
                yield scrapy.http.FormRequest(
                    url='{post_url}&city={city}'.format(post_url=self.post_url,
                                                        city=self.city),
                    headers=post_headers,
                    formdata={
                        'pn': str(self.curpage),
                        'first': 'true',
                        'kd': self.key
                    },
                    callback=self.parse_position)
            else:
                self.direction_index += 1
                self.curpage = 1
                self.city_index = 0
                self.key_index = 0
                self.total_page_count = None
                self.key_words = self.key_list[
                    self.direction_index]['second_types']
                self.city = self.city_list[self.city_index]
                self.key = self.key_words[self.key_index]
                yield scrapy.http.FormRequest(
                    url='{post_url}&city={city}'.format(post_url=self.post_url,
                                                        city=self.city),
                    headers=post_headers,
                    formdata={
                        'pn': str(self.curpage),
                        'first': 'true',
                        'kd': self.key
                    },
                    callback=self.parse_position)