def parse_company_info(root, item):
    """Copy company trade, size and address from ``root`` into ``item``.

    ``root`` is expected to be a Scrapy-style selector (it must offer
    ``.xpath()`` whose result offers ``.xpath()`` and ``.extract()``).
    Each extracted list is handed to ``try_get_value_from_array``
    (presumably reducing it to one value or None) before being stored.

    Note: ``company_name`` is intentionally not filled in here; an earlier
    lookup via '//div[@class="title-info "]/h3/a/text()' was disabled.
    """
    scope = root.xpath('.')
    field_paths = (
        ('company_trade', u".//*[contains(text(), '行业:')]/a/text()"),
        ('company_size', u'.//*[contains(text(), "公司规模:")]//text()'),
        ('company_address', u'.//*[contains(text(), "公司地址:")]/../text()'),
    )
    for field, path in field_paths:
        extracted = scope.xpath(path).extract()
        item[field] = try_get_value_from_array(extracted)
def parse_other_info(root, item):
    """Copy department, major, supervisor and subordinate into ``item``.

    The values are looked up below the ``./ul`` children of ``root``: for
    each field the ``<label>`` text next to the ``<span>`` carrying the
    matching caption is extracted and passed through
    ``try_get_value_from_array`` (presumably one value or None).
    """
    info_list = root.xpath("./ul")
    field_paths = (
        ('department', u'.//span[text()="所属部门:"]/../label/text()'),
        ('major', u'.//span[text()="专业要求:"]/../label/text()'),
        ('supervisor', u'.//span[text()="汇报对象:"]/../label/text()'),
        ('subordinate', u'.//span[text()="下属人数:"]/../label/text()'),
    )
    for field, path in field_paths:
        extracted = info_list.xpath(path).extract()
        item[field] = try_get_value_from_array(extracted)
def parse_company_info(root, item):
    """Copy company name, trade, type, size and address into ``item``.

    ``root`` is expected to be a Scrapy-style selector positioned on the
    company information section. Every lookup is evaluated relative to
    ``root``; each extracted list is handed to ``try_get_value_from_array``
    (presumably reducing it to one value or None) before being stored.
    """
    list_items = root.xpath('.')
    field_paths = (
        ('company_name', u'.//*[text()="公司名称:"]/../a/text()'),
        # This query used to start with '//', which searches the whole
        # document regardless of ``root``. It is now anchored with './/'
        # like every other query in this function, so a matching caption
        # elsewhere on the page can no longer be picked up by mistake.
        ('company_trade',
         u'.//*[text()="所属行业:"]/following-sibling::span/text()'),
        ('company_type', u'.//*[text()="公司性质:"]/../text()'),
        ('company_size', u'.//*[text()="公司规模:"]/../text()'),
        ('company_address', u'.//*[text()="公司地址:"]/../span[@title]/text()'),
    )
    for field, path in field_paths:
        item[field] = try_get_value_from_array(
            list_items.xpath(path).extract())
def parse_company_detail_info(root, item):
    """Store the company description text in ``item['company_detail']``.

    The text nodes of ``div[@class='info-word']`` inside the following
    sibling ``div`` elements of ``root`` are extracted and passed through
    ``try_get_value_from_array``. When that yields None, None is stored
    unchanged; otherwise the result is stored after a ``replace`` call
    that strips the matched characters.
    """
    texts = root.xpath(
        "./following-sibling::div/div[@class='info-word']/text()").extract()
    detail = try_get_value_from_array(texts)
    item['company_detail'] = (
        detail if detail is None else detail.replace(r' ', r''))
def parse_basic_require(root, item):
    """Copy work place, salary and the basic requirements into ``item``.

    All values are read from ``div[@class='job-title-left']``, which is
    located with a document-wide ('//') query issued through ``root``.
    Each extracted list is passed through ``try_get_value_from_array``
    (presumably one value or None).
    """
    title_block = root.xpath("//div[@class='job-title-left']")
    field_paths = (
        ('work_place', './/a/text()'),
        ('salary', './p/text()'),
        # Original note (translated): check the number of spans. The four
        # requirement fields below are picked purely by span position.
        ('education', u'.//span[1]/text()'),
        ('experience', u'.//span[2]/text()'),
        ('language', u'.//span[3]/text()'),
        ('age', u'.//span[4]/text()'),
    )
    for field, path in field_paths:
        extracted = title_block.xpath(path).extract()
        item[field] = try_get_value_from_array(extracted)
def parse_params_to_url(self, response):
    """Apply the search filters in ``self.params`` one request at a time.

    While ``self.i`` is below ``len(self.params)``, the filter link for
    entry ``self.params[self.i]`` is looked up inside the
    ``search-conditions`` block of ``response``, ``self.i`` is incremented,
    the link is joined onto ``self.base_url`` and stored in
    ``self.new_url``, and a ``Request`` for it is returned with this same
    method as callback. Once all entries are consumed, a ``Request`` for
    the last ``self.new_url`` is returned with ``self.go_last_page`` as
    callback and ``dont_filter=True``.
    """
    # Block that contains the search filter groups.
    root_node = response.xpath(".//div[@class='search-conditions']")
    if self.i < len(self.params):
        # NOTE(review): the request is returned from inside the loop, so
        # only the first key/value pair of the entry is used -- presumably
        # every entry holds exactly one pair; confirm this nesting.
        for k, v in self.params[self.i].items():
            # k is matched against the 'search-title' text to find the
            # filter group; its following <dd> siblings hold the options.
            param_type = root_node.xpath(
                u".//*[@class='search-title'][contains(text(), '{}')]/following-sibling::dd"
                .format(k))
            # v is matched against the option link text to get its href.
            url = try_get_value_from_array(
                param_type.xpath(
                    u".//a[contains(text(), '{}')]/@href".format(
                        v)).extract())
            self.i += 1
            self.new_url = urljoin(self.base_url, url)
            # self.new_url = response.urljoin(url)
            # print self.new_url, 'self.new_url'
            return Request(self.new_url, callback=self.parse_params_to_url)
    else:
        return Request(self.new_url, callback=self.go_last_page, dont_filter=True)
def parse_other_requirement_info(root, item):
    """Copy the additional candidate requirements from ``root`` into ``item``.

    The lookups run against the ``./div`` children of ``root`` and fill the
    experience, education, age, full_time, major, oversea and language
    fields. Each extracted list is passed through
    ``try_get_value_from_array`` (presumably one value or None).
    """
    sections = root.xpath('./div')
    field_paths = (
        ('experience', u'./ul/li/p[text()="工作经验:"]/../span/text()'),
        ('education', u'./ul/li/p[text()="学历要求:"]/../text()'),
        ('age', u'./ul/li/span[text()="年龄:"]/../text()'),
        ('full_time', u'./ul/li/span[text()="是否统招全日制:"]/../text()'),
        ('major', u'./div[text()="专业要求:"]/../div/p/text()'),
        ('oversea', u'./*[text()="海外经历:"]/../div/p/text()'),
        ('language', u'./div[text()="语言要求:"]/../div/p/text()'),
    )
    for field, path in field_paths:
        extracted = sections.xpath(path).extract()
        item[field] = try_get_value_from_array(extracted)
def parse_basic_info(root, item):
    """Copy the basic job facts from the ``./ul/li`` rows of ``root``.

    Fills department, job_type, subordinate, requirement, work_place,
    publish_time and supervisor. Each row is identified by the caption in
    its ``<span>``; the extracted list is passed through
    ``try_get_value_from_array`` (presumably one value or None).
    """
    rows = root.xpath(u'./ul/li')
    field_paths = (
        ('department', u'./span[text()="所属部门:"]/../text()'),
        ('job_type', u'./span[text()="职位类别:"]/../text()'),
        ('subordinate', u'./span[text()="下属人数:"]/../text()'),
        ('requirement', u'./span[text()="招聘人数:"]/../text()'),
        ('work_place', u'./span[text()="工作地点:"]/../a/text()'),
        ('publish_time', u'./span[text()="发布时间:"]/../*[2]/text()'),
        ('supervisor', u'./span[text()="汇报对象:"]/../text()'),
    )
    for field, path in field_paths:
        extracted = rows.xpath(path).extract()
        item[field] = try_get_value_from_array(extracted)
def go_last_page(self, response):
    """Return a request for the page linked by the ``a.last`` anchor.

    The href of ``//a[@class='last']`` is passed through
    ``try_get_value_from_array``, joined onto ``self.base_url`` and
    requested with ``self.get_total_page`` as callback; duplicate
    filtering is disabled for this request.
    """
    hrefs = response.xpath("//a[@class='last']/@href").extract()
    target = urljoin(self.base_url, try_get_value_from_array(hrefs))
    return Request(target, callback=self.get_total_page, dont_filter=True)
def get_next_page_url(self, response):
    """Return the "next page" link of ``response`` joined onto ``self.base_url``.

    The link is the href of the anchor whose text contains '下一页',
    reduced through ``try_get_value_from_array``.
    """
    hrefs = response.xpath(u"//a[contains(text(), '下一页')]/@href").extract()
    return urljoin(self.base_url, try_get_value_from_array(hrefs))
def parse_job_detail_info(root, item):
    """Store the string value of ``root`` in ``item['job_detail']``.

    ``string(.)`` is evaluated on ``root`` and the extracted list is
    reduced through ``try_get_value_from_array``.
    """
    extracted = root.xpath('string(.)').extract()
    item['job_detail'] = try_get_value_from_array(extracted)
def parse_company_detail_info(root, item):
    """Store the string value of ``root`` in ``item['company_detail']``.

    ``string(.)`` is evaluated on ``root`` and the extracted list is
    reduced through ``try_get_value_from_array``. A non-None result is
    stored after a ``replace`` call that strips the matched characters;
    None is stored unchanged.
    """
    list_items = root.xpath('string(.)')
    company_detail = try_get_value_from_array(list_items.extract())
    # The helper's result used to be chained straight into .replace(),
    # which raises AttributeError if the helper yields None (for example
    # when nothing was extracted). Guard it so a missing description does
    # not abort parsing of the whole item.
    if company_detail is not None:
        company_detail = company_detail.replace(r' ', r'')
    item['company_detail'] = company_detail
def parse_salary_info(root, item):
    """Store the annual salary range text in ``item['salary']``.

    The value is the link text inside the ``<span>`` elements of the
    ``./ul/li`` row whose caption span reads '年薪范围:', reduced through
    ``try_get_value_from_array``.
    """
    rows = root.xpath(u'./ul/li')
    salary_texts = rows.xpath(
        u'./span[text()="年薪范围:"]/../span/a/text()').extract()
    item['salary'] = try_get_value_from_array(salary_texts)