Ejemplo n.º 1
0
 def parse(self, response):
     """Turn a neitui listing response into job items.

     Every ``ul.list-items > li`` row becomes one ``WwwJobComItem``. Once all
     rows have been emitted, the result of ``self.next_request()`` is yielded.
     A response without any rows yields nothing.
     """
     print("request -> " + response.url)
     rows = response.css('ul.list-items > li')
     if not rows:
         return

     print("neitui Nums:" + str(len(rows)))
     for row in rows:
         item = WwwJobComItem()
         sections = row.css('div.positionleft > div')

         # Section 0: job link (id and title) plus the posting time.
         heading = sections[0]
         href = heading.css('a::attr(href)').extract_first()
         item['position_id'] = href.strip().replace("/j/", "")
         title = heading.css('a::text').extract_first()
         item["position_name"] = title.strip()
         posted = heading.css('span::text').extract_first()
         item['time'] = posted.strip()

         # Section 1: salary range and the city / experience / education spans.
         details = sections[1]
         raw_salary = details.css('span.mr10::text').extract_first()
         item["salary"] = raw_salary.strip().replace("k", "K")
         bounds = item["salary"].split("-")
         low = int(bounds[0].replace("K", ""))
         high = int(bounds[1].replace("K", ""))
         item["avg_salary"] = (low + high) / 2
         detail_texts = details.css('span::text').extract()
         item['city'] = detail_texts[5].strip()
         item['work_year'] = detail_texts[1].strip()
         item['education'] = detail_texts[3].strip()

         # Section 2: company name and financing stage.
         company = sections[2]
         company_name = company.css('span >a::text').extract_first()
         item['company_name'] = company_name.strip()
         company_texts = company.css('span::text').extract()
         item['finance_stage'] = company_texts[1].strip()

         # Fields this listing does not provide are stored as empty strings.
         item['industry_field'] = ""
         item['company_size'] = ""
         item['position_lables'] = ""
         item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
         item['platform'] = "neitui"
         yield item

     yield self.next_request()
Ejemplo n.º 2
0
    def parse(self, response):
        """Extract job items from a 51job result table.

        Each ``div.dw_table > div.el`` row yields one ``WwwJobComItem`` whose
        salary is normalised to a monthly range in thousands ("K"). Rows whose
        time cell reads "发布时间" and rows paid per day ("元/天") are skipped.
        When the page has more than one row, the result of
        ``self.next_request()`` is yielded after the items; otherwise nothing
        is yielded.
        """
        print("request -> " + response.url)
        job_list = response.css('div.dw_table > div.el')
        # A single row is not treated as data (presumably it is the header).
        if len(job_list) <= 1:
            return

        print("51job Nums:" + str(len(job_list)))
        for job in job_list:
            str_time = job.css('span.t5::text').extract_first().strip()
            if str_time == "发布时间":
                continue

            item = WwwJobComItem()
            item['position_id'] = job.css(
                'p.t1 > input::attr(value)').extract_first().strip()
            item["position_name"] = job.css(
                'p.t1 > span > a::text').extract_first().strip()

            salary = job.css('span.t4::text').extract_first().strip()
            if "万/月" in salary:
                # 1 万 (10,000) per month == 10 K per month.
                bounds = salary.replace("万/月", "").split("-")
                low = float(bounds[0]) * 10
                high = float(bounds[1]) * 10
                item["salary"] = str(low) + "K-" + str(high) + "K"
                item["avg_salary"] = (low + high) / 2
            elif "万/年" in salary:
                # Yearly figure in 万: convert to K (x10) and then to a
                # monthly amount (/12). The previous code only divided by 12,
                # which produced values ten times too small.
                bounds = salary.replace("万/年", "").split("-")
                low = round(float(bounds[0]) * 10 / 12, 2)
                high = round(float(bounds[1]) * 10 / 12, 2)
                item["salary"] = str(low) + "K-" + str(high) + "K"
                item["avg_salary"] = (low + high) / 2
            elif "元/天" in salary:
                # Daily rates are not converted; the row is dropped.
                continue
            else:
                # Remaining rows are taken to be "千/月", i.e. already in K.
                bounds = salary.replace("千/月", "").split("-")
                item["salary"] = bounds[0] + "K-" + bounds[1] + "K"
                item["avg_salary"] = (float(bounds[0]) +
                                      float(bounds[1])) / 2

            item['city'] = job.css('span.t3::text').extract_first().strip()
            item['work_year'] = ""
            item['education'] = ""
            item['company_name'] = job.css(
                'span.t2 > a::text').extract_first().strip()

            # Fields this listing does not provide are stored as empty strings.
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""
            item['time'] = str_time
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['platform'] = "51job"
            yield item

        yield self.next_request()
Ejemplo n.º 3
0
    def parse(self, response):
        """Turn a zhipin listing response into job items.

        Each ``div.job-list > ul > li`` entry becomes one ``WwwJobComItem``.
        The average salary, labels and publish time are stored as empty
        strings. After the items, the result of ``self.next_request()`` is
        yielded; a response without entries yields nothing.
        """
        print("request -> " + response.url)
        entries = response.css('div.job-list > ul > li')
        if not entries:
            return

        print("zhipin Nums:" + str(len(entries)))
        for entry in entries:
            item = WwwJobComItem()
            primary = entry.css('div.job-primary')

            job_id = entry.css(
                'div.info-primary > h3 > a::attr(data-jobid)').extract_first()
            item['position_id'] = job_id.strip()
            title = primary.css(
                'div.info-primary > h3 > a > div::text').extract_first()
            item["position_name"] = title.strip()
            # The salary text is stored exactly as extracted (not stripped).
            item["salary"] = primary.css(
                'div.info-primary > h3 > a > span::text').extract_first()
            # No average is derived from the salary text here.
            item["avg_salary"] = ''

            requirements = primary.css('div.info-primary > p::text').extract()
            item['city'] = requirements[0].strip()
            item['work_year'] = requirements[1].strip()
            item['education'] = requirements[2].strip()

            company_name = primary.css(
                'div.info-company > div.company-text > h3 > a::text'
            ).extract_first()
            item['company_name'] = company_name.strip()

            company_facts = primary.css(
                'div.info-company > div.company-text > p::text').extract()
            item['industry_field'] = company_facts[0].strip()
            if len(company_facts) == 3:
                item['finance_stage'] = company_facts[1].strip()
                item['company_size'] = company_facts[2].strip()
            else:
                # Without exactly three entries the second one is used as size.
                item['finance_stage'] = ""
                item['company_size'] = company_facts[1].strip()

            item['position_lables'] = ""
            item['time'] = ''
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
            item['platform'] = "zhipin"
            yield item

        yield self.next_request()
Ejemplo n.º 4
0
    def parse(self, response):
        """Turn a dajie JSON search response into job items.

        The body is decoded as UTF-8 JSON. If decoding fails, the raw body is
        printed and only the result of ``self.next_request()`` is yielded.
        If the payload's ``result`` is not 0, the method sleeps 10 seconds and
        yields ``self.next_request()``. Otherwise one ``WwwJobComItem`` is
        yielded per entry of ``data.list``; ``self.curPage`` is then advanced
        and, while it does not exceed ``data.totalPage``, ``self.url`` is
        rebuilt for that page and ``self.next_request()`` is yielded.
        """
        print("request -> " + response.url)
        try:
            html = json.loads(response.body.decode("utf-8"))
        except ValueError:
            print(response.body)
            yield self.next_request()
            # Stop here: ``html`` is unbound, and falling through used to
            # raise UnboundLocalError right after the retry was yielded.
            return

        if html.get("result") != 0:
            # NOTE(review): time.sleep blocks the calling thread; if this runs
            # inside Scrapy's event loop it stalls every in-flight request.
            time.sleep(10)
            yield self.next_request()
            return

        data = html.get('data')
        print("dajie Num:" + str(data.get('total')))
        for result in data.get('list'):
            item = WwwJobComItem()
            item['salary'] = result.get('salary').replace(" ", "").replace(
                "/月", "")
            if "-" in item["salary"]:
                bounds = item["salary"].split("-")
                item["avg_salary"] = (int(bounds[0].replace("K", "")) +
                                      int(bounds[1].replace("K", ""))) / 2
            else:
                # Single-value salaries are kept as text without the "K".
                item["avg_salary"] = item["salary"].replace("K", "")
            item['city'] = result.get('pubCity')
            item['finance_stage'] = ""
            item['industry_field'] = result.get('industryName')
            item['position_lables'] = ""
            item['position_id'] = result.get('jobseq')
            item['company_size'] = result.get('scaleName')
            item['position_name'] = result.get('jobName')
            item['work_year'] = result.get('pubEx')
            item['education'] = result.get('pubEdu')
            item['company_name'] = result.get('compName')
            item['time'] = result.get("time")
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['platform'] = "dajie"
            yield item

        totalPage = data.get("totalPage")
        self.curPage = self.curPage + 1
        if self.curPage <= totalPage:
            self.url = 'https://so.dajie.com/job/ajax/search/filter?keyword=' + self.job_name + '&order=0&city=' + self.city_id + '&recruitType=&salary=&experience=&page=' + str(
                self.curPage) + '&positionFunction=&_CSRFToken=&ajax=1'
            yield self.next_request()
Ejemplo n.º 5
0
    def parse(self, response):
        """Turn a lagou JSON search response into job items.

        If the body is not valid JSON, it is printed and only the result of
        ``self.next_request()`` is yielded. If the payload's ``success`` flag
        is falsy, the method sleeps 60 seconds and yields
        ``self.next_request()``. Otherwise, when ``resultSize`` is non-zero,
        one ``WwwJobComItem`` is yielded per entry of
        ``content.positionResult.result``; ``self.curPage`` is then advanced
        and ``self.next_request()`` is yielded while it does not exceed the
        computed page count.
        """
        print("request -> " + response.url)
        try:
            html = json.loads(response.body)
        except ValueError:
            print(response.body)
            yield self.next_request()
            # Stop here: ``html`` is unbound, and falling through used to
            # raise UnboundLocalError right after the retry was yielded.
            return

        if not html.get("success"):
            # NOTE(review): time.sleep blocks the calling thread; if this runs
            # inside Scrapy's event loop it stalls every in-flight request.
            time.sleep(60)
            yield self.next_request()
            return

        position_result = html.get('content').get('positionResult')
        if position_result.get('resultSize') == 0:
            return

        results = position_result.get('result')
        print('lagou Nums:' + str(len(results)))
        for result in results:
            item = WwwJobComItem()
            item['salary'] = result.get('salary').replace("k", "K")
            bounds = item["salary"].split("-")
            item["avg_salary"] = (int(bounds[0].replace("K", "")) +
                                  int(bounds[1].replace("K", ""))) / 2
            item['city'] = result.get('city')
            item['finance_stage'] = result.get('financeStage')
            item['industry_field'] = result.get('industryField')
            item['position_lables'] = result.get('positionAdvantage')
            item['position_id'] = result.get('positionId')
            item['company_size'] = result.get('companySize')
            item['position_name'] = result.get('positionName')
            item['work_year'] = result.get('workYear')
            item['education'] = result.get('education')
            item['company_name'] = result.get('companyShortName')
            item['time'] = result.get("formatCreateTime")
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['platform'] = "lagou"
            yield item

        # NOTE(review): floor() drops a trailing partial page when curPage is
        # 1-based; confirm against where curPage is initialised before
        # switching to ceil().
        totalPage = math.floor(
            int(position_result.get("totalCount")) /
            int(html.get('content').get("pageSize")))
        self.curPage = self.curPage + 1
        if self.curPage <= totalPage:
            yield self.next_request()
Ejemplo n.º 6
0
    def parse(self, response):
        """Turn a lagou HTML listing page into job items and paginate.

        One ``WwwJobComItem`` is yielded per ``<li>`` of the visible
        ``item_con_list``. Afterwards the total page count is read from the
        page into ``self.totalPage``, ``self.curPage`` is incremented, and a
        new request for ``self.url`` is yielded while the current page does
        not exceed the total.
        """

        def first_match(node, query):
            # First extracted value for ``query``; IndexError if none match.
            return node.xpath(query).extract()[0]

        listing = response.xpath(
            '//ul[@class="item_con_list" and @style="display: block;"]/li')
        for box in listing:
            item = WwwJobComItem()

            item['position_id'] = first_match(box, './@data-positionid')
            item["position_name"] = first_match(box, './@data-positionname')
            item["salary"] = first_match(box, './@data-salary')
            item["avg_salary"] = ''
            item['city'] = first_match(box, './/span[@class="add"]/em/text()')

            # The third text node holds "<experience>/<education>".
            requirement_texts = box.xpath(
                './/div[@class="p_bot"]/div[@class="li_b_l"]/text()').extract()
            requirement = requirement_texts[2].strip().split('/')
            item['work_year'] = requirement[0]
            item['education'] = requirement[1]
            item['company_name'] = first_match(
                box, './/div[@class="company_name"]/a/text()')

            # The industry text is split on "/" into field, stage and size.
            industry = first_match(
                box, './/div[@class="industry"]/text()').strip().split('/')
            item['industry_field'] = industry[0]
            item['finance_stage'] = industry[1]
            item['company_size'] = industry[2]
            item['position_lables'] = ""
            item['time'] = first_match(
                box, './/span[@class="format-time"]/text()')
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
            item['platform'] = "lagou"
            yield item

        self.totalPage = first_match(
            response,
            '//div[@class="page-number"]/span[@class="span totalNum"]/text()')
        self.curPage += 1
        if int(self.curPage) > int(self.totalPage):
            return

        print('next')
        yield scrapy.Request(url=self.url,
                             callback=self.parse,
                             dont_filter=True)
Ejemplo n.º 7
0
 def parse(self, response):
     """Turn a ganji listing response into job items.

     Each ``div.job-parttime > dl`` entry becomes one ``WwwJobComItem`` with
     a fixed position name. Salaries marked "面议" get an average of 0; other
     salaries are read as a "low-high" range and expressed in thousands.
     After the items, the result of ``self.next_request()`` is yielded; a
     response without entries yields nothing.
     """
     print("request -> " + response.url)
     entries = response.css('div.job-parttime > dl')
     if not entries:
         return

     print("ganji Nums:" + str(len(entries)))
     for entry in entries:
         item = WwwJobComItem()

         # Only the part of the input value before the first comma is the id.
         raw_id = entry.css('dt > div > input::attr(value)').extract_first()
         item['position_id'] = raw_id.strip().split(",")[0]
         item["position_name"] = "php开发工程师"

         unit = entry.css('em.unit::text').extract_first().strip()
         if unit == "面议":
             item["salary"] = "面议"
             item["avg_salary"] = 0
         else:
             bounds = entry.css(
                 'dt > div > p > em.lipay > i > strong::text'
             ).extract_first().strip().split("-")
             low = int(bounds[0])
             high = int(bounds[1])
             # Bounds are rounded up to whole thousands for the label.
             low_k = math.ceil(low / 1000)
             high_k = math.ceil(high / 1000)
             item["salary"] = str(low_k) + "K-" + str(high_k) + "K"
             item["avg_salary"] = (low + high) / 2000

         address = entry.css('dt > div > p.site > a::text').extract_first()
         item['city'] = address.strip().replace("地址:", "")
         experience = entry.css('dt > div > p > em.liexp::text').extract_first()
         item['work_year'] = experience.strip().replace("经验:", "")
         item['education'] = ""
         company = entry.css('div.j-comp > a::text').extract_first()
         item['company_name'] = company.strip()

         # Fields this listing does not provide are stored as empty strings.
         item['industry_field'] = ""
         item['finance_stage'] = ""
         item['company_size'] = ""
         item['position_lables'] = ""
         item['time'] = entry.css('p.time::text').extract_first().strip()
         item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
         item['platform'] = "ganji"
         yield item

     yield self.next_request()
Ejemplo n.º 8
0
    def parse(self, response):
        """Turn a job58 listing response into job items.

        Each ``li.job_item`` becomes one ``WwwJobComItem``, except entries
        whose sign text is "优选" or "精准", which are skipped. Salaries are
        converted to a range in thousands. After the items, the result of
        ``self.next_request()`` is yielded; a response without entries yields
        nothing.
        """
        print("request -> " + response.url)
        entries = response.css('li.job_item')
        if not entries:
            return

        print("job58 Nums:" + str(len(entries)))
        for entry in entries:
            item = WwwJobComItem()
            item['time'] = entry.css('span.sign::text').extract_first().strip()
            if item['time'] in ("优选", "精准"):
                continue

            # Strip the query-string scaffolding from the urlparams attribute.
            position_id = entry.css(
                'div.job_name > a::attr(urlparams)').extract_first().strip()
            for token in ("psid=", "&entinfo=", "_p", "_j"):
                position_id = position_id.replace(token, "")
            item['position_id'] = position_id

            item["position_name"] = entry.css(
                'div.job_comp > p.job_require >span::text').extract()[0].strip()

            raw_salary = entry.css('p.job_salary::text').extract_first().strip()
            if raw_salary == "面议":
                salary_label = raw_salary
                item["avg_salary"] = 0
            elif raw_salary == "1000":
                salary_label = "1K"
                item["avg_salary"] = 1.0
            else:
                bounds = raw_salary.split("-")
                low = int(bounds[0])
                high = int(bounds[1])
                # Bounds are rounded up to whole thousands for the label.
                salary_label = (str(math.ceil(low / 1000)) + "K-" +
                                str(math.ceil(high / 1000)) + "K")
                item["avg_salary"] = (low + high) / 2000
            item["salary"] = salary_label
            item['city'] = "郑州"

            requirements = entry.css(
                "div.job_comp > p.job_require > span::text").extract()
            item['work_year'] = requirements[2].strip()
            item['education'] = requirements[1].strip()
            item['company_name'] = entry.css(
                'div.comp_name > a::text').extract_first().strip()

            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            benefits = entry.css("div.job_wel > span::text").extract()
            item['position_lables'] = ",".join(benefits)
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
            item['platform'] = "job58"
            yield item

        yield self.next_request()
Ejemplo n.º 9
0
    def parse(self, response):
        """Turn a zhilian JSON search response into job items.

        If the body is not valid JSON, it is printed and only the result of
        ``self.next_request()`` is yielded. When the payload has a
        ``data.results`` list, one ``WwwJobComItem`` is yielded per entry,
        followed by ``self.next_request()``. A payload without
        ``data.results`` yields nothing.
        """
        print("request -> " + response.url)

        try:
            html = json.loads(response.body)
        except ValueError:
            print(response.body)
            yield self.next_request()
            # Stop here: ``html`` is unbound, and falling through used to
            # raise UnboundLocalError right after the retry was yielded.
            return

        # Tolerate payloads that are not objects or whose "data" is not one.
        data = html.get('data') if isinstance(html, dict) else None
        if not isinstance(data, dict) or 'results' not in data:
            return

        results = data.get('results')
        print('zhilian Nums:' + str(len(results)))
        for result in results:
            item = WwwJobComItem()
            item['salary'] = result.get('salary').replace("k", "K")
            # No average is derived from the salary text here.
            item["avg_salary"] = ''
            item['city'] = result.get('city').get("display")
            item['finance_stage'] = ''
            item['industry_field'] = ''
            item['position_lables'] = result.get('jobType').get(
                'items')[0].get('name')
            item['position_id'] = result.get('number')
            item['company_size'] = result.get('company').get(
                'size').get('name')
            item['position_name'] = result.get('jobName')
            item['work_year'] = result.get('workingExp').get('name')
            item['education'] = result.get('eduLevel').get('name')
            item['company_name'] = result.get('company').get('name')
            item['time'] = result.get("updateDate")
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['platform'] = "zhilianzhaopin"
            yield item

        yield self.next_request()
Ejemplo n.º 10
0
 def parse(self, response):
     """Extract PHP job items from a zhaopin result table.

     Only every second ``table.newlist > tr`` row (the 2nd, 4th, ...) is
     examined, and an item is yielded only when the row's title markup
     contains "php" in any letter case. Salaries are read as a "low-high"
     range and expressed in whole thousands ("K"). When the table has more
     than one row, the result of ``self.next_request()`` is yielded after
     the items; otherwise nothing is yielded.
     """
     print("request -> " + response.url)
     job_list = response.css('table.newlist > tr')
     if len(job_list) <= 1:
         return

     print("zhaopin Nums:" + str(len(job_list)))
     for position, job in enumerate(job_list, start=1):
         # Odd-numbered rows (including the first) are not job rows.
         if position % 2 != 0:
             continue

         item = WwwJobComItem()
         item['position_id'] = job.css(
             'td.zwmc > input::attr(data-monitor)').extract_first(
             ).strip().replace("|", "")
         # The whole anchor markup is searched, not only its text.
         name = job.css('td.zwmc > div > a').extract_first().strip()
         if "php" not in name.lower():
             continue

         item["position_name"] = "php研发工程师"
         bounds = job.css(
             'td.zwyx::text').extract_first().strip().split("-")
         low = int(bounds[0])
         high = int(bounds[1])
         # Both bounds are converted to K; the upper bound used to be
         # divided by 100, giving a figure ten times too large.
         item["salary"] = str(low // 1000) + "K-" + str(high // 1000) + "K"
         item["avg_salary"] = (low + high) / 2000
         item['city'] = "郑州"
         item['work_year'] = ""
         item['education'] = ""
         item['company_name'] = job.css(
             'td.gsmc > a::text').extract_first().strip()
         item['industry_field'] = ""
         item['finance_stage'] = ""
         item['company_size'] = ""
         item['position_lables'] = ""
         item['time'] = job.css(
             'td.gxsj > span::text').extract_first().strip()
         item['updated_at'] = time.strftime(
             "%Y-%m-%d %H:%M:%S", time.localtime())
         item['platform'] = "zhaopin"
         yield item

     yield self.next_request()
Ejemplo n.º 11
0
    def parse(self, response):
        """Turn a chinahr listing response into job items.

        Each ``div.jobList > ul`` entry becomes one ``WwwJobComItem``. The
        salary text is stored as extracted and no average is computed. After
        the items, the result of ``self.next_request()`` is yielded; a
        response without entries yields nothing.
        """
        print("request -> " + response.url)
        entries = response.css('div.jobList > ul')
        if not entries:
            return

        print("chinahr Nums:" + str(len(entries)))
        for entry in entries:
            item = WwwJobComItem()

            # Reduce the job link to its id by removing suffix and prefix.
            link = entry.css(
                'li.l1 > span.e1 > a::attr(href)').extract_first().strip()
            without_suffix = link.replace(
                ".html?searchplace=" + CITY_DICT[CITY], "")
            item['position_id'] = without_suffix.replace(
                "http://www.chinahr.com/job/", "")

            title = entry.css('li.l1 > span.e1 > a::text').extract_first()
            item["position_name"] = title.strip()
            item["salary"] = entry.css(
                'li.l2 > span.e2::text').extract_first()
            item["avg_salary"] = ''

            # One "/"-separated string carries location, experience, education.
            summary = entry.css('li.l2 > span.e1::text').extract_first()
            parts = summary.strip().split("/")
            item['city'] = parts[0] + parts[1]
            item['work_year'] = parts[2].replace("]\r\n\t\t\t\t\t\t\t", "")
            item['education'] = parts[3]

            company = entry.css('li.l1 > span.e3 > a::text').extract_first()
            item['company_name'] = company.strip()

            # Fields this listing does not provide are stored as empty strings.
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""

            posted = entry.css('li.l1 > span.e2::text').extract_first()
            item['time'] = posted.strip()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S")
            item['platform'] = "chinahr"
            yield item

        yield self.next_request()