def parse_node(self, response, node):
    """Build a JobItem from one Atom feed <entry> node.

    Skips postings older than ``max_age_days``; otherwise extracts the
    posting date, school/title, links, specialization keywords, rank and
    tenure wording, and returns a populated JobItem.

    :param response: the feed response (unused here; XMLFeedSpider API).
    :param node: one ``<entry>`` selector with the ``x:`` namespace bound.
    :return: a JobItem, or None when the posting is too old.
    """
    item = JobItem()

    posted_date = node.xpath('.//x:published/text()').get()
    # Feed timestamps are ISO 8601 and timezone-aware; reuse the entry's
    # tzinfo for "now" so the aware/naive subtraction below is legal.
    posted_date = datetime.fromisoformat(posted_date)
    timezone_info = posted_date.tzinfo
    posted_date_string = posted_date.strftime('%m/%d/%Y')
    now = datetime.now(tz=timezone_info)

    # FIX: the original flag was named "...past_five_days" but the code
    # accepted up to 10 days. Keep the 10-day behavior, name it honestly.
    max_age_days = 10
    if (now - posted_date).days > max_age_days:
        return

    # Entry titles look like "<school>: <job title>".
    title = node.xpath('.//x:title/text()').get()
    school, _, title = title.partition(':')
    school, title = map(str.strip, [school, title])

    details_url = node.xpath('.//x:link[@rel="alternate"]/@href').get()
    # Spreadsheet HYPERLINK formulas — the output is pasted into a sheet.
    ads_source = f'=hyperlink("{details_url}","ChemPostingCanada")'
    recruiter = f'=hyperlink("{details_url}","{school}")'

    ads_content = node.xpath('.//x:content/text()').get()
    # Crude tag stripper: remove anything that looks like an HTML tag.
    ads_content_text_only = re.sub('<[^<]+?>', ' ', ads_content)
    self.logger.info(f'{ads_content_text_only=}')

    # Chemistry sub-discipline keywords; the lookahead deliberately
    # excludes the word "biochemical" while still matching "bio..." terms.
    specialization = set(word.lower() for word in re.findall(
        r'org\w*|anal\w*|inorg\w*|bio(?!chemical\b)\w+|physic\w*|polymer\w*',
        ads_content_text_only, re.IGNORECASE))
    specialization = ', '.join(specialization)

    # Professor rank(s) mentioned in the ad, abbreviated list-style.
    rank = set(
        re.findall(r'Assistant\b|Associate\b|Full\s', ads_content_text_only))
    rank_text = '/'.join(word.strip().lower().replace(
        'assistant', 'asst').replace('associate', 'assoc') for word in rank)

    # First word containing "tenure" (e.g. "tenure-track"), if any.
    tenure_type = re.search(r'\S*tenure\S*', ads_content_text_only,
                            re.IGNORECASE)
    comments1 = tenure_type[0] if tenure_type else None

    item.update({
        'posted_date': posted_date_string,
        'ads_title': title,
        'canada': 'yes',
        'ads_source': ads_source,
        'school': recruiter,
        'specialization': specialization,
        'rank': rank_text,
        'comments1': comments1,
    })
    return item
def parse_node(self, response, node):
    """Build a JobItem from a single feed node.

    FIX: the original used absolute ``//title`` etc. XPaths, which search
    the WHOLE document from every node — every yielded item would repeat
    the same page-wide values. ``.//`` scopes each lookup to this node.

    :param response: the feed response (unused; XMLFeedSpider API).
    :param node: the selector for one feed record.
    :return: a JobItem whose fields are lists of extracted strings.
    """
    item = JobItem()
    item['title'] = node.xpath('.//title[@lang="en"]/text()').extract()
    item['author'] = node.xpath('.//author/text()').extract()
    item['year'] = node.xpath('.//year/text()').extract()
    item['price'] = node.xpath('.//price/text()').extract()
    return item
def parse(self, response):
    """Parse a Liepin job-list page, yielding one JobItem per row.

    FIX: ``extract_first()`` returns None when a row lacks the element,
    and the original then crashed on ``.strip()`` / ``.startswith()``;
    both calls are now guarded. The long repeated XPath prefixes are
    hoisted into two locals for readability (selectors unchanged).
    """
    # Shared prefixes of the per-row selectors.
    main = './/div[@class="sojob-item-main clearfix"]'
    info = main + '//div[@class="job-info"]'

    products = response.xpath('//ul[@class="sojob-list"]//li')
    for product in products:
        item = JobItem()
        # Guard: a missing title would make extract_first() return None.
        name = product.xpath(info + '//h3//a//text()').extract_first()
        item['name'] = name.strip() if name else name
        item['company'] = product.xpath(
            main + '//div[@class="company-info nohover"]'
            '//p[@class="company-name"]//a//text()').extract_first()
        item['area'] = product.xpath(
            info + '//p[@class="condition clearfix"]'
            '//a[@class="area"]//text()').extract_first()
        item['salary'] = product.xpath(
            info + '//p[@class="condition clearfix"]'
            '//span[@class="text-warning"]//text()').extract_first()
        item['publishdate'] = product.xpath(
            info + '//p[@class="time-info clearfix"]//time/@title'
        ).extract_first()
        link = product.xpath(info + '//h3//a/@href').extract_first()
        # Relative links come back without a scheme; prepend the site root.
        if link and not link.startswith('https'):
            link = 'https://www.liepin.com' + link
        item['link'] = link
        item['source'] = self.displayname
        item['downloaddate'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        yield item
def parse_ads(self, response, **cb_kwargs):
    """Refine school/rank/tenure fields from a job-ad page, then yield.

    Expects ``cb_kwargs`` to already carry 'school' and 'rank' defaults
    from the listing page; this callback overrides them with whatever
    the detail page provides, and yields the finished JobItem.
    """
    # Find the "Online App. Form" label inside the apply-info panel, and
    # read the link sitting in the sibling <div> right after it.
    apply_label = response.xpath(
        './/*[@id="jobApplyInfo"]//*[contains(@class, "field-label")]'
        '[contains(normalize-space(text()), "Online App. Form")]'
    )
    direct_link = apply_label.xpath(
        './following-sibling::div[1]/a/@data-orig-href').get()

    # Prefer the dedicated application link; otherwise link the ad page
    # itself (Chemjobber-list hyperlink-formula format).
    application_url = direct_link if direct_link else response.url
    cb_kwargs['school'] = (
        f'=hyperlink("{application_url}","{cb_kwargs["school"]}")')

    # Flatten every non-blank text fragment of the description.
    fragments = response.css('#jobDesc *::text').getall()
    job_description = ' '.join(
        piece.strip() for piece in fragments if re.search(r'\S', piece))

    # Abbreviate any professor ranks mentioned in the description.
    ranks_found = set(
        re.findall(r'Assistant\b|Associate\b|Full\s', job_description))
    rank_text = '/'.join(
        r.lower().replace('assistant', 'asst').replace('associate', 'assoc')
        for r in ranks_found)
    if rank_text:
        cb_kwargs['rank'] = rank_text

    # First word containing "tenure" (e.g. "tenure-track"), if any.
    tenure_match = re.search(r'\S*tenure\S*', job_description, re.IGNORECASE)
    cb_kwargs['comments1'] = tenure_match[0] if tenure_match else None

    yield JobItem(cb_kwargs)
def parse_redirect_application_url(self, response, **cb_kwargs):
    """Record the post-redirect application URL in the school cell.

    The final response URL (falling back to the request URL) is wrapped
    in a spreadsheet HYPERLINK formula around the school name, and the
    completed JobItem is yielded.
    """
    application_url = response.url if response.url else response.request.url
    cb_kwargs['school'] = (
        f'=hyperlink("{application_url}","{cb_kwargs["school"]}")')
    yield JobItem(cb_kwargs)
def parse_job(self, response):
    """Parse a Lagou job-list page, yielding one JobItem per row.

    FIX: the original used absolute ``//`` XPaths inside the per-row
    loop, so every row returned the first match of the WHOLE page;
    ``.//`` scopes each lookup to the current ``<li>``. The three
    overlapping ``experience_list`` assignments (two of them dead) are
    collapsed to the single rule they amounted to, and the digit regex
    uses a raw string to silence the invalid-escape warning.
    """
    for data in response.css('li.con_list_item'):
        # e.g. "3-5年 / 本科": experience text first, then its digits.
        condition = data.xpath(
            ".//div[@class='li_b_l']/text()").re_first(r'\S.*')
        years = re.findall(r'\d+', condition)
        # Fewer than two numbers = unspecified/open-ended range; default
        # to 0-1 (same net behavior as the original assignment chain).
        experience_list = years if len(years) > 1 else [0, 1]

        salary = data.xpath('.//span[@class="money"]/text()').extract_first()
        item = JobItem(
            title=data.xpath('.//h3/text()').extract_first(),
            city=data.xpath('.//em/text()').re_first(r'\w+'),
            salary_lower=salary.split('-')[0],
            salary_upper=salary.split('-')[1],
            experience_lower=experience_list[0],
            experience_upper=experience_list[1],
            # Second "/"-separated part of the condition line.
            education=condition.split('/')[1].strip(),
            tags=' '.join(
                data.css('div.list_item_bot span::text').extract()),
            company=data.xpath(".//div[@class='company_name']/a/text()"
                               ).extract_first().strip())
        yield item
def parse_job(self, response):
    """Load a job-detail page into a JobItem via its item loader."""
    loader = JobItemLoader(item=JobItem(), response=response)
    # Fields scraped straight from the page markup.
    for field, selector in (
            ("title", ".job-primary .info-primary .name h1::text"),
            ("salary", ".job-primary .info-primary .name .salary::text"),
            ("job_desc", ".detail-content .job-sec .text"),
            ("job_addr", ".job-primary .info-primary p")):
        loader.add_css(field, selector)
    # Fields derived from the request itself.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # The loaded item is handed on to the pipeline.
    return loader.load_item()
def parse_job(self, response):
    """Load a job-detail page into a JobItem and return it for the pipeline."""
    loader = JobItemLoader(item=JobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # Page-derived fields, selected by CSS.
    for field, selector in (
            ("title", ".job-title.clearfix .job-name::text"),
            ("salary", ".job-brief .job-info .salary::text"),
            ("job_desc", ".job-desc"),
            ("job_addr", ".job-brief .job-info .where::text")):
        loader.add_css(field, selector)
    # The returned item flows into the configured item pipeline.
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job-posting detail page into a JobItem.

    (Original docstring, translated from Chinese: "parse a Lagou job".)
    All page fields are loaded through the JobItemLoader; selectors are
    grouped into a CSS table and an XPath table below.
    """
    loader = JobItemLoader(item=JobItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))

    # CSS-selected fields.
    for field, css in (
        ('title',
         'body > div.position-head > div > div.position-content-l > div > span::text'),
        ('publish_time',
         'body > div.position-head > div > div.position-content-l > dd > p.publish_time::text'),
        ('tags',
         'body > div.position-head > div > div.position-content-l > dd > ul > li::text'),
        ('job_advantage', '#job_detail > dd.job-advantage > p::text'),
        ('job_desc', '#job_detail > dd.job_bt > div'),
        ('job_addr', '#job_detail > dd.job-address.clearfix > div.work_addr'),
        ('company_url', '#job_company > dt > a::attr(href)'),
        ('company_name', '#job_company > dt > a > img::attr(alt)'),
    ):
        loader.add_css(field, css)

    # XPath-selected fields (absolute paths into the header <dd> row).
    for field, xpath in (
        ('salary', '/html/body/div[2]/div/div[1]/dd/p[1]/span[1]/text()'),
        ('job_city', '/html/body/div[2]/div/div[1]/dd/p[1]/span[2]/text()'),
        ('work_years', '/html/body/div[2]/div/div[1]/dd/p[1]/span[3]/text()'),
        ('degree_need', '/html/body/div[2]/div/div[1]/dd/p[1]/span[4]/text()'),
        ('job_type', '/html/body/div[2]/div/div[1]/dd/p[1]/span[5]/text()'),
    ):
        loader.add_xpath(field, xpath)

    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse(self, response):
    """Parse a 51job result page, yielding one JobItem per listing row."""
    rows = response.xpath(
        '//div[@id="resultList"]//div[@class="el" or @class="el mk"]')
    for row in rows:
        item = JobItem()
        # Title/link live under the t1 cell; the other columns are t2..t5.
        title_parts = row.xpath(
            './/p[contains(@class, "t1")]//span//a/@title').extract()
        item['name'] = ''.join(title_parts).strip()
        company_parts = row.xpath(
            './/span[contains(@class, "t2")]//a/@title').extract()
        item['company'] = ''.join(company_parts).strip()
        item['area'] = row.xpath(
            './/span[contains(@class, "t3")]//text()').extract_first()
        item['salary'] = row.xpath(
            './/span[contains(@class, "t4")]//text()').extract_first()
        item['publishdate'] = row.xpath(
            './/span[contains(@class, "t5")]//text()').extract_first()
        item['link'] = row.xpath(
            './/p[contains(@class, "t1")]//span//a/@href').extract_first()
        item['source'] = self.name
        item['downloaddate'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        yield item