def parse_detail_utils_zhaopin(self, response, value):
    """Parse a zhaopin.com job-detail page into a Job51Item.

    Only pages whose <h1> title matches *value* (case-insensitive) are
    kept; otherwise the function returns None.

    :param response: scrapy Response for the job-detail page.
    :param value: keyword/regex the job title must contain.
    :return: loaded Job51Item, or None when the title does not match.
    """
    contain_key_word = response.xpath(
        "//div[@class='main1 cl main1-stat']//h1/text()").extract_first("")
    # extract_first("") prevents re.search from raising TypeError when
    # the page has no matching <h1> (extract_first() alone returns None).
    if not re.search(value, contain_key_word, re.IGNORECASE):
        return None
    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    itemloader.add_value("url_obj_id", get_md5(response.url))
    itemloader.add_value("title", contain_key_word)
    str_salary = response.xpath(
        "//div[@class='l info-money']/strong/text()").extract_first("")
    if '元/月' in str_salary:
        # e.g. "4000-8000元/月" -> 4000.0 / 8000.0
        try:
            list_str = str_salary.split("-")
            salary_min = float(list_str[0])
            salary_max = float(list_str[1].strip().split("元")[0].strip())
            itemloader.add_value("salary_min", salary_min)
            itemloader.add_value("salary_max", salary_max)
        except (ValueError, IndexError) as e:
            # Some listings carry no "-" range (e.g. "8000元/月以下");
            # previously this raised out of the whole parse.
            print('error str_salary', str_salary)
            print(e)
    elif '面议' in str_salary:
        # "negotiable" -> store 0.0/0.0 as a sentinel
        itemloader.add_value("salary_min", 0.0)
        itemloader.add_value("salary_max", 0.0)
    job_city = response.xpath(
        "//div[@class='info-three l']/span/a/text()").extract_first("")
    itemloader.add_value("job_city", job_city)
    experience_year = response.xpath(
        "//div[@class='info-three l']/span[2]/text()").extract_first("")
    itemloader.add_value("experience_year", experience_year)
    education_need = response.xpath(
        "//div[@class='info-three l']/span[3]/text()").extract_first("")
    itemloader.add_value("education_need", education_need)
    # NOTE(review): publish_date is set to crawl time here, not to the
    # posting's own date — the page date is not extracted; confirm intent.
    itemloader.add_value("publish_date", datetime.now())
    job_advantage_tags_list = response.xpath(
        "//div[@class='welfare']//ul//li/text()").extract()
    job_advantage_tags = (','.join(job_advantage_tags_list)
                          if job_advantage_tags_list else " ")
    position_list = response.xpath(
        "//div[@class='responsibility pos-common']//text()").extract()
    position_info_contains_job_request = (','.join(position_list)
                                          if position_list else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info",
                         position_info_contains_job_request)
    # No category on this page layout; mark as "未分类" (uncategorized).
    itemloader.add_value("job_classification", "未分类")
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()
Example #2
0
    def parse_detail(self, response):
        """Parse one JSON page of zhaopin.com search-API results.

        Yields a Job51Item for every result whose jobName matches the
        keyword, then a Request for the next result page while more
        results remain.

        :param response: scrapy Response whose body is the API's JSON.
        """
        if response.status != 200:
            return
        value = 'java'  # keyword a job title must contain to be kept
        dict_obj = json.loads(response.text)  # parse once, reuse below
        if dict_obj.get("data", ''):
            code = dict_obj['code']
            numFound = dict_obj['data']['numFound']
            if code == 200 and numFound > 0:
                for i, result in enumerate(dict_obj['data']['results']):
                    contain_key_word = result['jobName']
                    if not re.search(value, contain_key_word,
                                     re.IGNORECASE):
                        continue
                    itemloader = Job51ItemLoader(item=Job51Item(),
                                                 response=response)
                    itemloader.add_value("url", result['positionURL'])
                    # time.time() suffix keeps ids unique across re-crawls
                    itemloader.add_value(
                        "url_obj_id",
                        get_md5(response.url) + str(int(time.time())))
                    itemloader.add_value("title", contain_key_word)
                    str_salary = result['salary']
                    if 'K' in str_salary:
                        # e.g. "10K-15K" -> 10000.0 / 15000.0
                        try:
                            list_str = str_salary.split("-")
                            salary_min = float(
                                list_str[0].strip().split("K")[0].strip()
                            ) * 1000
                            salary_max = float(
                                list_str[1].strip().split("K")[0].strip()
                            ) * 1000
                            itemloader.add_value("salary_min", salary_min)
                            itemloader.add_value("salary_max", salary_max)
                        except Exception as e:
                            # e.g. "15K以上" has no "-" range
                            print('error str_salary', str_salary)
                            print(e)
                    else:
                        print('str_salary error', str_salary)
                        itemloader.add_value("salary_min", 0)
                        itemloader.add_value("salary_max", 0)
                    itemloader.add_value("job_city",
                                         result['city']['display'])
                    experience_year = result['workingExp']['name']
                    print('experience_year', experience_year)
                    itemloader.add_value("experience_year",
                                         experience_year)
                    education_need = result['eduLevel']['name']
                    print('education_need', education_need)
                    itemloader.add_value("education_need", education_need)
                    job_advantage_tags_list = result['welfare']
                    job_advantage_tags = (
                        ','.join(job_advantage_tags_list)
                        if job_advantage_tags_list else " ")
                    # Detail text requires a second fetch of the
                    # position page via the project helper.
                    position_list = \
                        self.get_position_info_contains_job_request_list(
                            result['positionURL'])
                    if position_list:
                        position_info_contains_job_request = ','.join(
                            position_list)
                    else:
                        print('error response url', response.url)
                        print('error item 第几个', i)
                        position_info_contains_job_request = " "
                    itemloader.add_value("job_advantage_tags",
                                         job_advantage_tags)
                    itemloader.add_value(
                        "position_info",
                        position_info_contains_job_request)
                    itemloader.add_value("job_classification",
                                         result['jobType']['display'])
                    itemloader.add_value("crawl_time", datetime.now())
                    # keep only the "YYYY-MM-DD" part of the timestamp
                    publish_date = result['createDate'].strip().split(
                        " ")[0].strip()
                    itemloader.add_value("publish_date", publish_date)
                    yield itemloader.load_item()
            # Paginate while at least one more full page (60 results)
            # remains.  The defaults were '' before, which raised
            # TypeError ('' < int, '' + 1) on a first page crawled
            # without meta; 0 keeps the integer arithmetic valid.
            start = response.meta.get('meta_data', 0)
            if start < numFound - 60:
                meta_data = 60 + start
                print('start', meta_data)
                page = response.meta.get('page', 0) + 1
                print('current_page', page)
                dic_page = {"p": page, "jl": "489", "kw": "java", "kt": "3"}
                data = '{0}'.format(dic_page)
                print(data)
                from urllib import parse
                url_data = parse.quote(string=data, encoding="utf-8")
                url_next = (
                    'https://fe-api.zhaopin.com/c/i/sou?start={start}'
                    '&pageSize=60&cityId=489&workExperience=-1'
                    '&education=-1&companyType=-1&employmentType=-1'
                    '&jobWelfareTag=-1&kw=java&kt=3'
                    '&lastUrlQuery={lastUrlQuery}').format(
                        start=meta_data, lastUrlQuery=url_data)
                yield Request(url=url_next,
                              callback=self.parse_detail,
                              meta={
                                  'meta_data': meta_data,
                                  'page': page
                              })
Example #3
0
    def parse_job(self, response):
        """Parse a lagou.com job-detail page into a LagouJobItem.

        A 302 response is treated as an anti-bot captcha page: the
        captcha image is OCR'd through the ruokuai service and the
        result submitted with selenium.  A normal page is kept only
        when the job title matches one of the tracked keywords.

        :param response: scrapy Response for the job-detail page.
        :return: loaded LagouJobItem, or None when no keyword matches.
        """
        global global_result
        if response.status == 302:
            print("302")
            print(response.url)
            try:
                time.sleep(1)  # brief pause before fetching the captcha
                src = response.xpath(
                    "//img[@id='captcha']/@src").extract_first("")
                if src:
                    print('src:', src)
                    img_src = "https://www.lagou.com" + src
                    try:
                        image = Image.open(
                            BytesIO((requests.get(img_src)).content))
                        image.save('verify2.gif')
                        rcf = RClientFour(self.ruokuai_username,
                                          self.ruokuai_passwd)
                        image = open('verify2.gif', 'rb').read()
                        global_result = rcf.rk_create_code(image,
                                                           3040).get('Result')
                        print('result:', global_result)
                    except IOError:
                        # Most likely the ruokuai account ran out of credit.
                        print('*****检查自己的快豆是不是没了****')
                    # NOTE(review): the webdriver never loads any page
                    # (no browser.get(...)) before find_element, so this
                    # submit path looks broken — confirm against the
                    # intended captcha flow before relying on it.
                    browser = webdriver.Chrome(
                        executable_path="/home/wqh/下载/chromedriver")
                    browser.find_element_by_xpath("//*[@id='code']").send_keys(
                        global_result)
                    browser.find_element_by_xpath("//a[@id='submit']").click()
                    return
            except Exception as e:
                print(e)
                print('不是验证页面')

        title = response.xpath(
            "/html/body/div[2]/div/div[1]/div/span").extract_first("")
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        # (pattern, label) pairs replace the original copy-pasted
        # if-chain; raw strings keep the regex escapes intact.
        # NOTE(review): these are plain substring regexes — "go" also
        # matches e.g. "Django"; confirm that is intended.
        tracked = ((r"java", "java"), (r"python", "python"),
                   (r"人工智能", "人工智能"), (r"算法", "算法"),
                   (r"大数据", "大数据"), (r"C\+\+", "C++"), (r"go", "go"))
        list_type = [label for pattern, label in tracked
                     if re.search(pattern, title, re.IGNORECASE)]
        if list_type:
            # Parse the lagou job posting fields.
            item_loader.add_value("type", list_type)
            item_loader.add_css("title", ".job-name::attr(title)")
            item_loader.add_value("url", response.url)
            # time.time() suffix keeps ids unique across re-crawls.
            item_loader.add_value(
                "url_obj_id",
                get_md5(response.url) + str(int(time.time())))
            str_salary = response.xpath(
                "//span[@class='salary']/text()").extract_first("")
            if 'k' in str_salary:
                # e.g. "10k-15k" -> 10000.0 / 15000.0
                try:
                    list_str = str_salary.split("-")
                    salary_min = float(
                        list_str[0].strip().split("k")[0].strip()) * 1000
                    salary_max = float(
                        list_str[1].strip().split("k")[0].strip()) * 1000
                    item_loader.add_value("salary_min", salary_min)
                    item_loader.add_value("salary_max", salary_max)
                except Exception as e:
                    print('error str_salary', str_salary)
                    print(e)
            else:
                print('str_salary error', str_salary)
                item_loader.add_value("salary_min", 0)
                item_loader.add_value("salary_max", 0)
            item_loader.add_xpath(
                "job_city", "//*[@class='job_request']/p/span[2]/text()")
            item_loader.add_xpath(
                "experience_year",
                "//*[@class='job_request']/p/span[3]/text()")
            item_loader.add_xpath(
                "education_need",
                "//*[@class='job_request']/p/span[4]/text()")
            item_loader.add_xpath(
                "job_type", "//*[@class='job_request']/p/span[5]/text()")
            item_loader.add_value("job_classification", title)
            item_loader.add_css("publish_date", ".publish_time::text")
            item_loader.add_css("job_advantage_tags",
                                ".job-advantage p::text")
            item_loader.add_css("position_info", ".job_bt div")
            item_loader.add_css("job_addr", ".work_addr")
            item_loader.add_css("company_name",
                                "#job_company dt a img::attr(alt)")
            item_loader.add_css("company_url",
                                "#job_company dt a::attr(href)")
            item_loader.add_value("crawl_time", datetime.now())
            return item_loader.load_item()
Example #4
0
    def parse_job(self, response):
        """Parse a lagou.com job-detail page into a LagouJobItem.

        Keeps only postings whose title matches one of the tracked
        technology keywords; returns None otherwise.

        :param response: scrapy Response for the job-detail page.
        :return: loaded LagouJobItem, or None when no keyword matches.
        """
        title = response.xpath(
            "/html/body/div[2]/div/div[1]/div/span").extract_first("")
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        # (pattern, label) pairs replace the copy-pasted if-chain;
        # raw strings keep the regex escapes intact.
        tracked = ((r"java", "java"), (r"python", "python"),
                   (r"人工智能", "人工智能"), (r"算法", "算法"),
                   (r"大数据", "大数据"), (r"C\+\+", "C++"), (r"go", "go"))
        list_type = [label for pattern, label in tracked
                     if re.search(pattern, title, re.IGNORECASE)]
        if list_type:
            # Parse the lagou job posting fields.
            item_loader.add_value("type", list_type)
            item_loader.add_css("title", ".job-name::attr(title)")
            item_loader.add_value("url", response.url)
            # time.time() suffix keeps ids unique across re-crawls.
            item_loader.add_value(
                "url_obj_id",
                get_md5(response.url) + str(int(time.time())))
            str_salary = response.xpath(
                "//span[@class='salary']/text()").extract_first("")
            if 'k' in str_salary:
                # e.g. "10k-15k" -> 10000.0 / 15000.0
                try:
                    list_str = str_salary.split("-")
                    salary_min = float(
                        list_str[0].strip().split("k")[0].strip()) * 1000
                    salary_max = float(
                        list_str[1].strip().split("k")[0].strip()) * 1000
                    item_loader.add_value("salary_min", salary_min)
                    item_loader.add_value("salary_max", salary_max)
                except Exception as e:
                    print('error str_salary', str_salary)
                    print(e)
            else:
                print('str_salary error', str_salary)
                item_loader.add_value("salary_min", 0)
                item_loader.add_value("salary_max", 0)
            item_loader.add_xpath(
                "job_city", "//*[@class='job_request']/p/span[2]/text()")
            item_loader.add_xpath(
                "experience_year",
                "//*[@class='job_request']/p/span[3]/text()")
            item_loader.add_xpath(
                "education_need",
                "//*[@class='job_request']/p/span[4]/text()")
            item_loader.add_xpath(
                "job_type", "//*[@class='job_request']/p/span[5]/text()")
            try:
                item_loader.add_css("job_classification",
                                    '.position-label li::text')
            except Exception as e:
                print("job_classification error")
                print(e)
                # BUGFIX: the fallback previously used add_value, which
                # stored the literal selector string as the field value.
                # Query the title selector as a proper CSS fallback.
                item_loader.add_css("job_classification",
                                    '.job-name::attr(title)')
            item_loader.add_css("publish_date", ".publish_time::text")
            item_loader.add_css("job_advantage_tags",
                                ".job-advantage p::text")
            item_loader.add_css("position_info", ".job_bt div")
            item_loader.add_css("job_addr", ".work_addr")
            item_loader.add_css("company_name",
                                "#job_company dt a img::attr(alt)")
            item_loader.add_css("company_url",
                                "#job_company dt a::attr(href)")
            item_loader.add_value("crawl_time", datetime.now())
            return item_loader.load_item()
Example #5
0
def parse_detail_utils(self, response, value):
    """Parse a 51job-style job-detail page into a Job51Item.

    Only pages whose <h1> title matches *value* (case-insensitive) are
    kept; otherwise the function returns None.

    :param response: scrapy Response for the job-detail page.
    :param value: keyword/regex the job title must contain.
    :return: loaded Job51Item, or None when the title does not match.
    """
    # extract_first("") avoids AttributeError on .strip() for pages
    # that have no matching <h1> (extract_first() alone returns None).
    contain_key_word = response.xpath(
        "//div[@class='tHeader tHjob']//h1/text()").extract_first("").strip()
    if not re.search(value, contain_key_word, re.IGNORECASE):
        return None
    itemloader = Job51ItemLoader(item=Job51Item(), response=response)
    itemloader.add_value("url", response.url)
    # time.time() suffix keeps ids unique across re-crawls.
    itemloader.add_value("url_obj_id",
                         get_md5(response.url) + str(int(time.time())))
    itemloader.add_value("title", contain_key_word)
    # Salary is normalized to a monthly CNY figure; any parse failure
    # falls back to the 0/0 sentinel instead of aborting the item.
    salary_min = 0
    salary_max = 0
    try:
        str_salary = response.xpath(
            "/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()"
        ).extract_first("")
        if '千/月' in str_salary:
            # e.g. "5-8千/月" -> 5000.0 / 8000.0
            list_str = str_salary.split("-")
            salary_min = float(list_str[0]) * 1000
            salary_max = float(
                list_str[1].strip().split("千")[0].strip()) * 1000
        elif '万/月' in str_salary:
            # e.g. "1-1.5万/月" -> 10000.0 / 15000.0
            list_str = str_salary.strip().split("-")
            salary_min = float(list_str[0]) * 10000
            salary_max = float(
                list_str[1].strip().split("万")[0].strip()) * 10000
        elif '万/年' in str_salary:
            # Yearly figures are divided by 12 for a monthly value.
            list_str = str_salary.strip().split("-")
            salary_min = float(list_str[0]) * 10000 / 12
            salary_max = float(
                list_str[1].strip().split("万")[0].strip()) * 10000 / 12
    except Exception as e:
        print("str_salary error")
        print(e)
        salary_min = 0
        salary_max = 0
    itemloader.add_value("salary_min", salary_min)
    itemloader.add_value("salary_max", salary_max)
    # default "" guards the .strip() below on pages missing the info bar
    info = response.xpath(
        "//p[@class='msg ltype']/@title").extract_first("")
    job_city = info.strip().split("|")[0].strip()
    experience_year = find_in_list(self, key="经验", list_name=info)
    itemloader.add_value("job_city", job_city)
    itemloader.add_value("experience_year", experience_year)
    try:
        education_need = info.strip().split("|")[2].strip()
        print(education_need)
        if '人' in education_need:
            # The third segment is sometimes the headcount ("招X人")
            # rather than an education level; store "无" (none) then.
            education_need = "无"
        itemloader.add_value("education_need", education_need)
    except Exception as e:
        print("education_need error null")
        print(e)
    publish_date = find_in_list(self, key="发布", list_name=info)
    itemloader.add_value("publish_date", publish_date)
    job_advantage_tags_list = response.xpath(
        "//div[@class='t1']//span/text()").extract()
    job_advantage_tags = (','.join(job_advantage_tags_list)
                          if job_advantage_tags_list else " ")
    position_list = response.xpath(
        "//div[@class='bmsg job_msg inbox']/p//text()").extract()
    position_info_contains_job_request = (','.join(position_list)
                                          if position_list else " ")
    itemloader.add_value("job_advantage_tags", job_advantage_tags)
    itemloader.add_value("position_info",
                         position_info_contains_job_request)
    job_classification = response.xpath(
        "//div[@class='tCompany_main']//div[@class='mt10']/p[1]//a/text()"
    ).extract_first("")
    itemloader.add_value("job_classification", job_classification)
    itemloader.add_value("crawl_time", datetime.now())
    return itemloader.load_item()