def parse_detail_utils_zhaopin(self, response, value):
    """Build a ``Job51Item`` from a job-detail page (Zhaopin layout).

    The posting is kept only when its title matches ``value``.

    Args:
        self: Not used by this function.
        response: Page response exposing ``xpath()`` and ``url``.
        value: Regular-expression pattern searched case-insensitively in
            the job title.

    Returns:
        The loaded item, or ``None`` when the title is missing or does not
        match ``value``.
    """
    title = response.xpath(
        "//div[@class='main1 cl main1-stat']//h1/text()").extract_first()
    # extract_first() gives None when the heading is absent; passing that to
    # re.search would raise TypeError, so treat it as "no match".
    if title is None or not re.search(value, title, re.IGNORECASE):
        return None

    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    itemloader.add_value("url_obj_id", get_md5(response.url))
    itemloader.add_value("title", title)

    str_salary = response.xpath(
        "//div[@class='l info-money']/strong/text()").extract_first("")
    salary_range = None
    if '元/月' in str_salary:
        # Expected shape: "<min>-<max>元/月".
        try:
            parts = str_salary.split("-")
            salary_range = (float(parts[0]),
                            float(parts[1].split("元")[0]))
        except (ValueError, IndexError):
            # Single value or non-numeric text: fall back to 0 instead of
            # aborting the whole item.
            salary_range = (0.0, 0.0)
    elif '面议' in str_salary:
        # "Negotiable": no figures published.
        salary_range = (0.0, 0.0)
    # Any other salary text leaves both salary fields unset.
    if salary_range is not None:
        itemloader.add_value("salary_min", salary_range[0])
        itemloader.add_value("salary_max", salary_range[1])

    job_city = response.xpath(
        "//div[@class='info-three l']/span/a/text()").extract_first("")
    itemloader.add_value("job_city", job_city)
    experience_year = response.xpath(
        "//div[@class='info-three l']/span[2]/text()").extract_first("")
    itemloader.add_value("experience_year", experience_year)
    education_need = response.xpath(
        "//div[@class='info-three l']/span[3]/text()").extract_first("")
    itemloader.add_value("education_need", education_need)
    # The page's own publish date is not read; the crawl moment is stored.
    itemloader.add_value("publish_date", datetime.now())

    welfare_texts = response.xpath(
        "//div[@class='welfare']//ul//li/text()").extract()
    description_texts = response.xpath(
        "//div[@class='responsibility pos-common']//text()").extract()
    # A single space is stored when nothing was extracted.
    job_advantage_tags = ','.join(welfare_texts) if welfare_texts else " "
    position_info = (','.join(description_texts)
                     if description_texts else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info", position_info)
    itemloader.add_value("job_classification", "未分类")
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()
Beispiel #2
0
    def parse_detail(self, response):
        """Parse one page of search-API JSON and yield the matching jobs.

        Every entry of ``data.results`` whose ``jobName`` contains "java"
        (case-insensitive) is loaded into a ``Job51Item``. Afterwards a
        request for the next page of 60 results is yielded while the current
        start offset is below ``numFound - 60``.

        Args:
            response: Response whose ``text`` is the JSON payload. Its
                ``meta`` may carry ``meta_data`` (current start offset) and
                ``page`` (current page number).

        Yields:
            Loaded items, followed by at most one ``Request`` for the next
            page.
        """
        if response.status != 200:
            return

        value = 'java'
        dict_obj = json.loads(response.text)
        data = dict_obj.get("data")
        if not data:
            # No payload: nothing to emit and no total to paginate against.
            return

        num_found = data.get('numFound') or 0
        results = data.get('results') or []

        if dict_obj.get('code') == 200 and num_found > 0:
            for i, result in enumerate(results):
                contain_key_word = result['jobName']
                if not re.search(value, contain_key_word, re.IGNORECASE):
                    continue

                position_url = result['positionURL']
                itemloader = Job51ItemLoader(item=Job51Item(),
                                             response=response)
                itemloader.add_value("url", position_url)
                # Hash the posting's own URL: response.url is the search-API
                # URL shared by every result on this page, so hashing it
                # gave all items parsed in the same second the same id.
                itemloader.add_value(
                    "url_obj_id",
                    get_md5(position_url) + str(int(time.time())))
                itemloader.add_value("title", contain_key_word)

                # Expected shape: "<min>K-<max>K"; anything else becomes 0.
                str_salary = result['salary']
                salary_min = salary_max = 0
                if 'K' in str_salary:
                    try:
                        list_str = str_salary.split("-")
                        salary_min = float(
                            list_str[0].split("K")[0]) * 1000
                        salary_max = float(
                            list_str[1].split("K")[0]) * 1000
                    except (ValueError, IndexError) as e:
                        print('error str_salary', str_salary)
                        print(e)
                        salary_min = salary_max = 0
                else:
                    print('str_salary error', str_salary)
                itemloader.add_value("salary_min", salary_min)
                itemloader.add_value("salary_max", salary_max)

                itemloader.add_value("job_city", result['city']['display'])
                experience_year = result['workingExp']['name']
                print('experience_year', experience_year)
                itemloader.add_value("experience_year", experience_year)
                education_need = result['eduLevel']['name']
                print('education_need', education_need)
                itemloader.add_value("education_need", education_need)

                welfare = result['welfare']
                job_advantage_tags = ','.join(welfare) if welfare else " "
                position_info_list = (
                    self.get_position_info_contains_job_request_list(
                        position_url))
                if position_info_list:
                    position_info = ','.join(position_info_list)
                else:
                    print('error response url', response.url)
                    print('error item 第几个', i)
                    position_info = " "
                itemloader.add_value("job_advantage_tags",
                                     job_advantage_tags)
                itemloader.add_value("position_info", position_info)
                itemloader.add_value("job_classification",
                                     result['jobType']['display'])
                itemloader.add_value("crawl_time", datetime.now())
                # Keep only the part of createDate before the first space.
                publish_date = result['createDate'].strip().split(
                    " ")[0].strip()
                itemloader.add_value("publish_date", publish_date)
                yield itemloader.load_item()

        # Defaults assume the first request starts at offset 0 on page 1
        # when the caller supplied no meta -- TODO confirm with the caller.
        start = response.meta.get('meta_data', 0)
        if start < num_found - 60:
            meta_data = start + 60
            print('start', meta_data)
            page = response.meta.get('page', 1) + 1
            print('current_page', page)
            dic_page = {"p": page, "jl": "489", "kw": "java", "kt": "3"}
            # NOTE(review): this is the Python dict repr (single quotes),
            # not JSON; kept as-is so the request URL stays unchanged.
            query = '{0}'.format(dic_page)
            print(query)
            from urllib import parse
            url_data = parse.quote(string=query, encoding="utf-8")
            url_next = 'https://fe-api.zhaopin.com/c/i/sou?start={start}&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=java&kt=3&lastUrlQuery={lastUrlQuery}'.format(
                start=meta_data, lastUrlQuery=url_data)
            yield Request(url=url_next,
                          callback=self.parse_detail,
                          meta={
                              'meta_data': meta_data,
                              'page': page
                          })
Beispiel #3
0
def parse_detail_utils(self, response, value):
    """Build a ``Job51Item`` from a job-detail page.

    The posting is kept only when its title matches ``value``.

    Args:
        self: Forwarded to ``find_in_list``; not otherwise used here.
        response: Page response exposing ``xpath()`` and ``url``.
        value: Regular-expression pattern searched case-insensitively in
            the job title.

    Returns:
        The loaded item, or ``None`` when the title is missing or does not
        match ``value``.
    """
    raw_title = response.xpath(
        "//div[@class='tHeader tHjob']//h1/text()").extract_first()
    # extract_first() gives None when the heading is absent; calling
    # .strip() on it would raise AttributeError.
    if raw_title is None:
        return None
    contain_key_word = raw_title.strip()
    if not re.search(value, contain_key_word, re.IGNORECASE):
        return None

    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    itemloader.add_value("url_obj_id",
                         get_md5(response.url) + str(int(time.time())))
    itemloader.add_value("title", contain_key_word)

    # (marker, unit character, multiplier, months): a "<min>-<max><unit>"
    # range is converted to a monthly amount.
    salary_units = (
        ('千/月', '千', 1000, 1),
        ('万/月', '万', 10000, 1),
        ('万/年', '万', 10000, 12),
    )
    str_salary = response.xpath(
        "/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()"
    ).extract_first("")
    # Missing, unrecognised or malformed salary text is stored as 0.
    salary_min = salary_max = 0
    for marker, unit, multiplier, months in salary_units:
        if marker not in str_salary:
            continue
        try:
            list_str = str_salary.strip().split("-")
            salary_min = float(list_str[0]) * multiplier / months
            salary_max = (float(list_str[1].split(unit)[0])
                          * multiplier / months)
        except (ValueError, IndexError) as e:
            print("str_salary error")
            print(e)
            salary_min = salary_max = 0
        break
    itemloader.add_value("salary_min", salary_min)
    itemloader.add_value("salary_max", salary_max)

    # "|"-separated summary string; an absent attribute is treated as empty.
    info = response.xpath(
        "//p[@class='msg ltype']/@title").extract_first("")
    info_parts = [part.strip() for part in info.strip().split("|")]
    job_city = info_parts[0]
    experience_year = find_in_list(self, key="经验", list_name=info)

    itemloader.add_value("job_city", job_city)
    itemloader.add_value("experience_year", experience_year)
    try:
        education_need = info_parts[2]
    except IndexError as e:
        # Fewer than three segments: the field is left unset.
        print("education_need error null")
        print(e)
    else:
        # A third segment containing '人' is stored as "无" (none).
        if '人' in education_need:
            education_need = "无"
        itemloader.add_value("education_need", education_need)

    publish_date = find_in_list(self, key="发布", list_name=info)
    itemloader.add_value("publish_date", publish_date)

    tag_texts = response.xpath("//div[@class='t1']//span/text()").extract()
    description_texts = response.xpath(
        "//div[@class='bmsg job_msg inbox']/p//text()").extract()
    # A single space is stored when nothing was extracted.
    job_advantage_tags = ','.join(tag_texts) if tag_texts else " "
    position_info = (','.join(description_texts)
                     if description_texts else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info", position_info)
    job_classification = response.xpath(
        "//div[@class='tCompany_main']//div[@class='mt10']/p[1]//a/text()"
    ).extract_first("")
    itemloader.add_value("job_classification", job_classification)
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()