def parse_node(self, response, node):
    """Turn one feed entry into a ``JobItem``, or skip it.

    An entry is skipped (``None`` is returned) when its ``x:published``
    timestamp is missing, cannot be parsed as ISO 8601, or is more than
    ``max_age_days`` days old.

    Args:
        response: The feed response (not used by this method).
        node: Selector for a single feed entry. Assumes the ``x`` namespace
            prefix has been registered on it by the caller.

    Returns:
        A populated ``JobItem``, or ``None`` when the entry is skipped.
    """
    max_age_days = 10  # age cutoff applied to the published timestamp

    published_raw = node.xpath('.//x:published/text()').get()
    if not published_raw:
        self.logger.warning('Skipping feed entry without a published date')
        return None
    try:
        posted_date = datetime.fromisoformat(published_raw.strip())
    except ValueError:
        # Skip only this entry; raising here would abort the remaining ones.
        self.logger.warning(
            'Skipping feed entry with unparseable published date %r',
            published_raw)
        return None

    # Build "now" with the entry's own tzinfo (naive stays naive) so the
    # subtraction never mixes naive and aware datetimes.
    now = datetime.now(tz=posted_date.tzinfo)
    if (now - posted_date).days > max_age_days:
        return None

    # Output date format is mm/dd/yyyy.
    posted_date_string = posted_date.strftime('%m/%d/%Y')

    # Titles look like "<school>: <position>"; without a colon the whole
    # title ends up in ``school`` and ``title`` is empty.
    raw_title = node.xpath('.//x:title/text()').get() or ''
    school, _, title = raw_title.partition(':')
    school, title = school.strip(), title.strip()

    details_url = node.xpath('.//x:link[@rel="alternate"]/@href').get()
    ads_source = f'=hyperlink("{details_url}","ChemPostingCanada")'
    recruiter = f'=hyperlink("{details_url}","{school}")'

    # Replace markup tags with spaces so the keyword searches below only see
    # the text of the posting. A missing content node is treated as empty.
    ads_content = node.xpath('.//x:content/text()').get() or ''
    ads_content_text_only = re.sub(r'<[^<]+?>', ' ', ads_content)
    self.logger.info('ads_content_text_only=%r', ads_content_text_only)

    # Specialization keywords, lower-cased and de-duplicated in order of
    # first appearance (dict.fromkeys keeps the output deterministic).
    specialization_hits = re.findall(
        r'org\w*|anal\w*|inorg\w*|bio(?!chemical\b)\w+|physic\w*|polymer\w*',
        ads_content_text_only, re.IGNORECASE)
    specialization = ', '.join(
        dict.fromkeys(word.lower() for word in specialization_hits))

    # Ranks mentioned in the description. Matches are normalized *before*
    # de-duplication so that e.g. "Full " and "Full\n" collapse to one "full".
    rank_hits = re.findall(r'Assistant\b|Associate\b|Full\s',
                           ads_content_text_only)
    rank_text = '/'.join(dict.fromkeys(
        word.strip().lower().replace('assistant', 'asst').replace(
            'associate', 'assoc')
        for word in rank_hits))

    # First whitespace-delimited token containing "tenure", if any.
    tenure_type = re.search(r'\S*tenure\S*', ads_content_text_only,
                            re.IGNORECASE)
    comments1 = tenure_type[0] if tenure_type else None

    item = JobItem()
    item.update({
        'posted_date': posted_date_string,
        'ads_title': title,
        'canada': 'yes',
        'ads_source': ads_source,
        'school': recruiter,
        'specialization': specialization,
        'rank': rank_text,
        'comments1': comments1,
    })
    return item
 def parse_node(self, response, node):
     """Build a ``JobItem`` from the title/author/year/price of one node.

     The XPaths are relative (``.//``) so they only match inside ``node``;
     an absolute ``//`` would search the whole document whenever the node
     selector is backed by the full response.

     Args:
         response: The feed response (not used by this method).
         node: Selector for the node being parsed.

     Returns:
         A ``JobItem`` whose fields are lists of the matched text values.
     """
     item = JobItem()
     item['title'] = node.xpath('.//title[@lang="en"]/text()').extract()
     item['author'] = node.xpath('.//author/text()').extract()
     item['year'] = node.xpath('.//year/text()').extract()
     item['price'] = node.xpath('.//price/text()').extract()
     return item
Example #3
0
 def parse(self, response):
     """Yield one ``JobItem`` per ``<li>`` of the ``sojob-list`` result list.

     Fields that are missing from a row are left as ``None`` instead of
     raising, so one malformed row does not abort the rest of the page.

     Args:
         response: The search-result page response.

     Yields:
         ``JobItem`` objects with name, company, area, salary, publishdate,
         link, source and downloaddate set.
     """
     from urllib.parse import urljoin

     site_root = 'https://www.liepin.com'

     # Shared XPath prefixes; the concatenated expressions are the same as
     # spelling each one out in full.
     main = './/div[@class="sojob-item-main clearfix"]'
     job_info = main + '//div[@class="job-info"]'
     condition = job_info + '//p[@class="condition clearfix"]'

     for product in response.xpath('//ul[@class="sojob-list"]//li'):
         item = JobItem()

         name = product.xpath(job_info + '//h3//a//text()').extract_first()
         item['name'] = name.strip() if name is not None else None
         item['company'] = product.xpath(
             main + '//div[@class="company-info nohover"]'
             '//p[@class="company-name"]//a//text()').extract_first()
         item['area'] = product.xpath(
             condition + '//a[@class="area"]//text()').extract_first()
         item['salary'] = product.xpath(
             condition + '//span[@class="text-warning"]//text()'
         ).extract_first()
         item['publishdate'] = product.xpath(
             job_info + '//p[@class="time-info clearfix"]//time/@title'
         ).extract_first()

         link = product.xpath(job_info + '//h3//a/@href').extract_first()
         # Absolute links are kept verbatim; anything else is resolved
         # against the site root (plain concatenation would mangle
         # protocol-relative or slash-less hrefs).
         if link is not None and not link.startswith(('http://', 'https://')):
             link = urljoin(site_root, link)
         item['link'] = link

         item['source'] = self.displayname
         item['downloaddate'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         yield item
Example #4
0
    def parse_ads(self, response, **cb_kwargs):
        """Complete a job record from its detail page and yield a ``JobItem``.

        Updates ``cb_kwargs`` in place: ``school`` becomes a spreadsheet
        hyperlink formula, ``rank`` is replaced when ranks are found in the
        job description, and ``comments1`` is set to the first token that
        mentions "tenure" (or ``None``).

        Args:
            response: The job detail page response.
            **cb_kwargs: Fields collected so far; must contain ``school``, and
                ``rank`` when no rank is found in the description.

        Yields:
            A single ``JobItem`` built from the updated ``cb_kwargs``.
        """
        online_application_title_field = response.xpath(
            './/*[@id="jobApplyInfo"]//*[contains(@class, "field-label")][contains(normalize-space(text()), "Online App. Form")]'
        )
        online_application_url = online_application_title_field.xpath(
            './following-sibling::div[1]/a/@data-orig-href').get()

        # Update the school field to embed the link to the online app if it
        # exists (following Chemjobber List format); otherwise link the page.
        application_url = online_application_url or response.url
        cb_kwargs[
            'school'] = f'=hyperlink("{application_url}","{cb_kwargs["school"]}")'

        # Flatten the description to single-space-separated text, dropping
        # whitespace-only text nodes.
        job_description = ' '.join(
            word.strip()
            for word in response.css('#jobDesc *::text').getall()
            if re.search(r'\S', word))

        # Ranks mentioned in the description. Each match is stripped and
        # abbreviated *before* de-duplication (the "Full\s" alternative
        # captures a trailing whitespace character), and dict.fromkeys keeps
        # first-appearance order so the result is deterministic.
        rank_hits = re.findall(r'Assistant\b|Associate\b|Full\s',
                               job_description)
        rank_text = '/'.join(dict.fromkeys(
            hit.strip().lower().replace('assistant', 'asst').replace(
                'associate', 'assoc')
            for hit in rank_hits))
        cb_kwargs['rank'] = rank_text or cb_kwargs['rank']

        tenure_type = re.search(r'\S*tenure\S*', job_description,
                                re.IGNORECASE)
        cb_kwargs['comments1'] = tenure_type[0] if tenure_type else None

        yield JobItem(cb_kwargs)
Example #5
0
 def parse_redirect_application_url(self, response, **cb_kwargs):
     """Embed the application URL reached by this response into ``school``.

     The URL is ``response.url``, falling back to ``response.request.url``
     when that is empty. ``cb_kwargs['school']`` is rewritten as a
     spreadsheet hyperlink formula and a ``JobItem`` built from
     ``cb_kwargs`` is yielded.
     """
     target_url = response.url
     if not target_url:
         target_url = response.request.url

     school_label = cb_kwargs["school"]
     cb_kwargs['school'] = f'=hyperlink("{target_url}","{school_label}")'

     yield JobItem(cb_kwargs)
Example #6
0
 def parse_job(self, response):
     """Yield one ``JobItem`` per ``li.con_list_item`` entry on the page.

     All XPaths are relative to the current list entry (``.//``); an absolute
     ``//`` would return the first match of the whole page for every entry.
     Missing pieces yield ``None`` fields instead of raising.

     Args:
         response: The job-list page response.

     Yields:
         ``JobItem`` objects with title, city, salary bounds, experience
         bounds, education, tags and company.
     """
     for data in response.css('li.con_list_item'):
         # First non-blank text of the summary div; the code below expects
         # it to look like "<experience> / <education>".
         summary = data.xpath(".//div[@class='li_b_l']/text()").re_first(
             r'\S.*')

         # Experience bounds: use the numbers found when there are at least
         # two, otherwise fall back to [0, 1].
         # NOTE(review): matched bounds are digit strings while the fallback
         # holds ints; kept as-is for downstream compatibility.
         experience = re.findall(r'\d+', summary or '')
         experience_list = experience if len(experience) > 1 else [0, 1]

         summary_parts = summary.split('/') if summary else []
         education = (summary_parts[1].strip()
                      if len(summary_parts) > 1 else None)

         # Salary text is expected as "<lower>-<upper>".
         money = data.xpath('.//span[@class="money"]/text()').extract_first()
         money_parts = money.split('-') if money else []
         salary_lower = money_parts[0] if money_parts else None
         salary_upper = money_parts[1] if len(money_parts) > 1 else None

         company = data.xpath(
             ".//div[@class='company_name']/a/text()").extract_first()
         if company is not None:
             company = company.strip()

         yield JobItem(
             title=data.xpath('.//h3/text()').extract_first(),
             city=data.xpath('.//em/text()').re_first(r'\w+'),
             salary_lower=salary_lower,
             salary_upper=salary_upper,
             experience_lower=experience_list[0],
             experience_upper=experience_list[1],
             education=education,
             tags=' '.join(
                 data.css('div.list_item_bot span::text').extract()),
             company=company)
Example #7
0
    def parse_job(self, response):
        """Populate a ``JobItem`` from a job detail page via ``JobItemLoader``.

        Collects title, url, url_object_id (``get_md5`` of the URL), salary,
        job_desc and job_addr, then returns the loaded item.
        """
        loader = JobItemLoader(item=JobItem(), response=response)

        loader.add_css("title", ".job-primary .info-primary .name h1::text")

        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))

        remaining_css_fields = (
            ("salary", ".job-primary .info-primary .name .salary::text"),
            ("job_desc", ".detail-content .job-sec .text"),
            ("job_addr", ".job-primary .info-primary p"),
        )
        for field_name, css_query in remaining_css_fields:
            loader.add_css(field_name, css_query)

        # The returned item is passed on to the pipeline.
        return loader.load_item()
Example #8
0
    def parse_job(self, response):
        """Populate a ``JobItem`` from a job detail page via ``JobItemLoader``.

        Collects title, url, url_object_id (``get_md5`` of the URL), salary,
        job_desc and job_addr, then returns the loaded item.
        """
        loader = JobItemLoader(item=JobItem(), response=response)

        loader.add_css("title", ".job-title.clearfix .job-name::text")

        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))

        remaining_css_fields = (
            ("salary", ".job-brief .job-info .salary::text"),
            ("job_desc", ".job-desc"),
            ("job_addr", ".job-brief .job-info .where::text"),
        )
        for field_name, css_query in remaining_css_fields:
            loader.add_css(field_name, css_query)

        # The returned item is passed on to the pipeline.
        return loader.load_item()
Example #9
0
 def parse_job(self, response):
     """
     Parse a Lagou job posting.

     Feeds the page URL, a set of CSS/XPath selections and the crawl time
     into a ``JobItemLoader`` and returns the loaded item.

     :param response: the job detail page response
     :return: the item produced by ``JobItemLoader.load_item()``
     """
     loader = JobItemLoader(item=JobItem(), response=response)

     page_url = response.url
     loader.add_value('url', page_url)
     loader.add_value('url_object_id', get_md5(page_url))

     loader.add_css(
         'title',
         'body > div.position-head > div > div.position-content-l > div > span::text'
     )

     # Positional spans of the summary paragraph, in document order.
     summary_xpaths = (
         ('salary', '/html/body/div[2]/div/div[1]/dd/p[1]/span[1]/text()'),
         ('job_city', '/html/body/div[2]/div/div[1]/dd/p[1]/span[2]/text()'),
         ('work_years',
          '/html/body/div[2]/div/div[1]/dd/p[1]/span[3]/text()'),
         ('degree_need',
          '/html/body/div[2]/div/div[1]/dd/p[1]/span[4]/text()'),
         ('job_type', '/html/body/div[2]/div/div[1]/dd/p[1]/span[5]/text()'),
     )
     for field_name, xpath_query in summary_xpaths:
         loader.add_xpath(field_name, xpath_query)

     detail_css = (
         ('publish_time',
          'body > div.position-head > div > div.position-content-l > dd > p.publish_time::text'),
         ('tags',
          'body > div.position-head > div > div.position-content-l > dd > ul > li::text'),
         ('job_advantage', '#job_detail > dd.job-advantage > p::text'),
         ('job_desc', '#job_detail > dd.job_bt > div'),
         ('job_addr',
          '#job_detail > dd.job-address.clearfix > div.work_addr'),
         ('company_url', '#job_company > dt > a::attr(href)'),
         ('company_name', '#job_company > dt > a > img::attr(alt)'),
     )
     for field_name, css_query in detail_css:
         loader.add_css(field_name, css_query)

     loader.add_value('crawl_time', datetime.now())
     return loader.load_item()
Example #10
0
 def parse(self, response):
     """Yield one ``JobItem`` per result row inside ``div#resultList``.

     Rows are the ``div`` elements whose class is exactly ``el`` or
     ``el mk``. Name and company are the concatenated, stripped ``title``
     attributes of the matching links; the other fields take the first
     match (``None`` when absent).
     """
     row_query = '//div[@id="resultList"]//div[@class="el" or @class="el mk"]'

     for row in response.xpath(row_query):
         name_titles = row.xpath(
             './/p[contains(@class, "t1")]//span//a/@title').extract()
         company_titles = row.xpath(
             './/span[contains(@class, "t2")]//a/@title').extract()

         def first_match(query, row=row):
             # First result of a row-relative XPath, or None.
             return row.xpath(query).extract_first()

         item = JobItem()
         item['name'] = ''.join(name_titles).strip()
         item['company'] = ''.join(company_titles).strip()
         item['area'] = first_match('.//span[contains(@class, "t3")]//text()')
         item['salary'] = first_match(
             './/span[contains(@class, "t4")]//text()')
         item['publishdate'] = first_match(
             './/span[contains(@class, "t5")]//text()')
         item['link'] = first_match(
             './/p[contains(@class, "t1")]//span//a/@href')
         item['source'] = self.name
         timestamp = datetime.now()
         item['downloaddate'] = timestamp.strftime("%Y-%m-%d %H:%M:%S")
         yield item