Code example #1
File: lesjeudis.py (project: pyjobs/crawlers)
        'from_page_enabled': True,
        'from_list__next_page__css': 'ul.pagination li.arrow a::attr(href)',

        'from_list__jobs_lists__css': 'body',
        'from_list__jobs__css': 'div[itemtype="http://schema.org/JobPosting"]',
        'from_list__url__css': 'div#job-title h2 a::attr(href)',
        'from_list__title__css': 'div#job-title h2 a::text',
        'from_list__publication_datetime__css': 'span[itemprop="datePosted"]',
        'from_list__tags__css': 'p[itemprop="skills"] a::text',
        'from_list__address__css': 'span[itemprop="jobLocation"]::text',
        'from_list__company__css': 'div[itemprop="hiringOrganization"] a::text',
        # 'from_list__company_url__css': 'div[itemprop="hiringOrganization"] a::attr(href)',

        'from_page__description__css': 'div.job-content',
        'from_page__publication_datetime__css': 'p.info span:nth-child(2)',
    }

    def _get_from_list__publication_datetime(self, node):
        return datetime.now()

    def _get_from_page__publication_datetime(self, node):
        raw_date = self._extract_first(node, 'from_page__publication_datetime')
        if raw_date:  # The date looks like "24 août 2015"
            raw_date_english = self._month_french_to_english(raw_date)  # Convert it to English
            return datetime.strptime(raw_date_english, '%d %B %Y')  # Parse the date out of that text
        return datetime.now()

# Don't forget this line
source = JobSource.from_job_spider(LesJeudisSpider)
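
Note: several of these spiders call a _month_french_to_english helper so that dates such as "24 août 2015" become parseable by datetime.strptime with '%d %B %Y'. Below is a minimal sketch of what such a helper might look like, written as a hypothetical standalone function; the actual pyjobs implementation may differ.

# Hypothetical sketch, not the pyjobs implementation.
FRENCH_TO_ENGLISH_MONTHS = {
    u'janvier': 'January', u'février': 'February', u'mars': 'March',
    u'avril': 'April', u'mai': 'May', u'juin': 'June',
    u'juillet': 'July', u'août': 'August', u'septembre': 'September',
    u'octobre': 'October', u'novembre': 'November', u'décembre': 'December',
}

def month_french_to_english(raw_date):
    # Replace the first French month name found so that
    # datetime.strptime(result, '%d %B %Y') can parse it.
    for french, english in FRENCH_TO_ENGLISH_MONTHS.items():
        if french in raw_date:
            return raw_date.replace(french, english)
    return raw_date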

Code example #2
            company_info.css('.website > a::attr(href)').extract())
        # description = job_node.css('div.job-description').extract()

        tags_html = self._extract_all(job_node,
                                      'from_page__tags',
                                      required=False)
        if tags_html:
            item['tags'] = [Tag(tag, 1) for tag in tags_html]

        # 19 mars 2015,
        #
        # thedate_xpath = './div[@id="content-core"]/div[@id="content-core"]/div[@class="discreet"]/text()'
        # # print 'DATE IS', job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()
        # thedate = job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()[0].strip().replace(u'Créé le ', '')
        # # Now date is formatted as "14/10/2015 13:17"
        # thedatetime = datetime.strptime(thedate, '%d/%m/%Y %H:%M')
        # publication_datetime = thedatetime

        item['address'] = address
        item['url'] = url  # used as uid
        item['source'] = self.name
        item['company'] = company
        item['company_url'] = company_url
        item['initial_crawl_datetime'] = datetime.now()
        item['status'] = JobItem.CrawlStatus.COMPLETED

        yield item


source = JobSource.from_job_spider(RemixJobsSpider)
Code example #3
File: linuxjobs.py (project: CkuT/crawlers)
        ),
        'from_page__address__css': 'div.container div.row:nth-child(2) .col-md-9 h4:nth-child(4)::text',
        'from_page__description__css': 'div.container div.row:nth-child(4) div.col-md-9',
        'from_page__tags__css': 'div.container div.row:nth-child(4) div.col-md-9'
    }

    def _get_from_list__jobs(self, node):
        jobs = super(LinuxJobsSpider, self)._get_from_list__jobs(node)
        if jobs:
            return jobs[::-1]  # Reverse jobs list (they are in asc order)
        return jobs

    def _get_from_list__url(self, jobs_node):
        if len(jobs_node.css('h4')) < 1:  # If there is no h4, this is not a job posting
            raise NotCrawlable()
        return super(LinuxJobsSpider, self)._get_from_list__url(jobs_node)

    def _get_from_page__publication_datetime(self, job_container):
        publication_datetime_str = self._extract_first(job_container, 'from_page__publication_datetime')
        publication_datetime_str = publication_datetime_str.replace(u'Ajout\xe9e le', '')
        publication_datetime_str_english = self._month_french_to_english(publication_datetime_str)
        return datetime.strptime(publication_datetime_str_english, '%d %B %Y')

    def _get_from_page__address(self, job_container):
        address = super(LinuxJobsSpider, self)._get_from_page__address(job_container)
        if address:
            return re.sub(r'\([^)]*\)', '', address).strip()  # address looks like "Paris (programmeurs)"
        return None

source = JobSource.from_job_spider(LinuxJobsSpider)
Code example #4
from pyjobs_crawlers.spiders import JobSpider, JobSource


class UrbanLinkerSpider(JobSpider):

    name = 'urbanlinker'
    start_urls = ['http://www.urbanlinker.com/offresdemploi/motcle/python/']
    label = 'Urban Linker'
    url = 'http://www.urbanlinker.com/'
    logo_url = 'http://www.urbanlinker.com/wp-content/themes/urbanlinker/images/logo-new.jpg'

    _crawl_parameters = {
        'from_page_enabled': True,
        'from_list__jobs_lists__css': '#contentoffres',
        'from_list__jobs__css': 'article.post',
        'from_list__url__css': 'h2.title-article a::attr(href)',
        'from_list__next_page__css': 'ul.bottomnav-content li.last a::attr(href)',
        'from_list__title__css': 'h2.title-article h2 a::text',
        'from_list__publication_datetime__css': '.post-info time::attr(datetime)',
        'from_page__container__css': 'article.post',
        'from_page__title__css': 'h1.title-job::text',
        'from_page__description__css': 'div.post-content',
        'from_page__address__css': 'header h1 + span::text',
    }

    def _get_from_list__publication_datetime(self, node):
        return self._extract_first(node, 'from_list__publication_datetime', required=False)

# Don't forget this line
source = JobSource.from_job_spider(UrbanLinkerSpider)
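
Note: the _crawl_parameters keys follow a <field>__css / <field>__xpath suffix convention, and helpers such as _extract_first presumably resolve a field name to whichever selector is configured. A hedged sketch of that lookup follows (hypothetical; the real JobSpider internals may differ, and some keys, as in the afpy example further down, hold a tuple of fallback selectors).

# Hypothetical sketch of the selector lookup, not the real JobSpider code.
def resolve_selector(crawl_parameters, field, node):
    # Try the CSS variant of the key first, then the XPath variant
    # (the order is an assumption).
    css = crawl_parameters.get(field + '__css')
    if css:
        return node.css(css)
    xpath = crawl_parameters.get(field + '__xpath')
    if xpath:
        return node.xpath(xpath)
    return None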
Code example #5
    name = 'urbanlinker'
    start_urls = ['http://www.urbanlinker.com/offresdemploi/motcle/python/']
    label = 'Urban Linker'
    url = 'http://www.urbanlinker.com/'
    logo_url = 'http://www.urbanlinker.com/wp-content/themes/urbanlinker/images/logo-new.jpg'

    _crawl_parameters = {
        'from_page_enabled': True,
        'from_list__jobs_lists__css': '#contentoffres',
        'from_list__jobs__css': 'article.post',
        'from_list__url__css': 'h2.title-article a::attr(href)',
        'from_list__next_page__css':
        'ul.bottomnav-content li.last a::attr(href)',
        'from_list__title__css': 'h2.title-article h2 a::text',
        'from_list__publication_datetime__css':
        '.post-info time::attr(datetime)',
        'from_page__container__css': 'article.post',
        'from_page__title__css': 'h1.title-job::text',
        'from_page__description__css': 'div.post-content',
        'from_page__address__css': 'header h1 + span::text',
    }

    def _get_from_list__publication_datetime(self, node):
        return self._extract_first(node,
                                   'from_list__publication_datetime',
                                   required=False)


# Don't forget this line
source = JobSource.from_job_spider(UrbanLinkerSpider)
Code example #6
File: lolix.py (project: BenoitEchernier/crawlers)
        satisfying = super(LolixJobSpider, self)._item_satisfying(item)
        if satisfying:
            # Keep only offers whose title mentions Python
            if 'python' not in item['title'].lower():
                return False
        return satisfying

    def address_forbidden_content(self):
        return [
            u'Tél',
            u'offre',
            u'Administration',
            u'BTP',
            u'Enseignement',
            u'Industrie',
            u'Informatique',
            u'Recherche',
            u'Editeur',
            u'Internet',
            u'SSII'
        ]

    def match_str(self, string, forbidden_string_items):
        for forbidden_item in forbidden_string_items:
            if forbidden_item in string:
                return True

        return False

source = JobSource.from_job_spider(LolixJobSpider)
Code example #7
File: linuxjobs.py (project: BenoitEchernier/crawlers)
        jobs = super(LinuxJobsSpider, self)._get_from_list__jobs(node)
        if jobs:
            return jobs[::-1]  # Reverse jobs list (they are in asc order)
        return jobs

    def _get_from_list__url(self, jobs_node):
        if len(jobs_node.css('h4')) < 1:  # If there is no h4, this is not a job posting
            raise NotCrawlable()
        return super(LinuxJobsSpider, self)._get_from_list__url(jobs_node)

    def _get_from_page__publication_datetime(self, job_container):
        publication_datetime_str = self._extract_first(
            job_container, 'from_page__publication_datetime')
        publication_datetime_str = publication_datetime_str.replace(
            u'Ajout\xe9e le', '')
        publication_datetime_str_english = self._month_french_to_english(
            publication_datetime_str)
        return datetime.strptime(publication_datetime_str_english, '%d %B %Y')

    def _get_from_page__address(self, job_container):
        address = super(LinuxJobsSpider,
                        self)._get_from_page__address(job_container)
        if address:
            return re.sub(
                r'\([^)]*\)', '',
                address).strip()  # address looks like "Paris (programmeurs)"
        return None


source = JobSource.from_job_spider(LinuxJobsSpider)
Code example #8
File: remixjobs.py (project: algoo/crawlers)
        # company_url = job_infos.xpath('./li[1]/a/@href').extract_first().strip()

        address = job_infos.xpath('./li[4]/text()').extract_first().strip().rstrip(',')
        # description = job_node.css('div.job-description').extract()

        tags_html = self._extract_first(job_node, 'from_page__tags', required=False)
        if tags_html:
            item['tags'] = self.extract_tags(tags_html)

        # 19 mars 2015,
        #
        # thedate_xpath = './div[@id="content-core"]/div[@id="content-core"]/div[@class="discreet"]/text()'
        # # print 'DATE IS', job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()
        # thedate = job_node.xpath(thedate_xpath)[0].extract().strip().splitlines()[0].strip().replace(u'Créé le ', '')
        # # Now date is formatted as "14/10/2015 13:17"
        # thedatetime = datetime.strptime(thedate, '%d/%m/%Y %H:%M')
        # publication_datetime = thedatetime


        item['address'] = address
        item['url'] = url  # used as uid
        item['source'] = self.name
        item['company'] = company
        # item['company_url'] = company_url
        item['initial_crawl_datetime'] = datetime.now()
        item['status'] = JobItem.CrawlStatus.COMPLETED

        yield item

source = JobSource.from_job_spider(RemixJobsSpider)
Code example #9
File: pole_emploi.py (project: lebouquetin/crawlers)
    logo_url = u'http://www.pole-emploi.fr/accueil/image/site/logo/logo-pole-emploi_region.png'

    _crawl_parameters = {
        'from_page_enabled': True,
        'from_list__jobs_lists__css': 'div#offrescartezone div.result-page table.definition-table',
        'from_list__jobs__css': 'tr[itemtype="http://schema.org/JobPosting"]',
        'from_list__url__css': 'a::attr(href)',
        'from_list__title__css': 'a.title::text',
        'from_list__company__css': 'span.company span[itemprop=name]::text',
        'from_list__next_page__css': None,
        # FIXME - D.A. - 2016-02-19 - next page is protected by javascript
        # This is not a problem for us (we crawl every 15 minutes)
        'from_page__container__css': '#offre-body',
        'from_page__title__css': 'h4[itemprop=title]',
        'from_page__publication_datetime__css': 'span[itemprop=datePosted]::text',
        'from_page__company__css': '#second h3.nom::text',
        'from_page__address__css': 'li[itemprop=addressRegion]::text',
        'from_page__description__css': '#offre-body div',
        'from_page__tags__css': 'p[itemprop=description]::text',
    }

    def _get_from_page__publication_datetime(self, job_node):
        date_text = self._extract_first(job_node, 'from_page__publication_datetime')
        if date_text:
            return datetime.strptime(date_text, '%d/%m/%Y')
        return super(PoleEmploiSpider, self)._get_from_page__publication_datetime(job_node)


# Don't forget this line
source = JobSource.from_job_spider(PoleEmploiSpider)
Code example #10
        'from_page__address__xpath':
        './/h4[1]/following-sibling::div[@class="row"]/text()',
        'from_page__description__css':
        '#content',
        'from_page__tags__xpath':
        './div[@id="content-core"]/div[@id="content-core"]'
    }

    def _get_from_page__publication_datetime(self, job_container):
        try:
            publication_date_text = self._extract_first(
                job_container, 'from_page__publication_datetime')
            if publication_date_text:
                publication_date_text_clean = publication_date_text.replace(
                    u'Créé le ', '')
                return datetime.strptime(publication_date_text_clean,
                                         '%d/%m/%Y %H:%M')
            return super(
                AfpyJobSpider,
                self)._get_from_page__publication_datetime(job_container)
        except Exception as exc:
            self.get_connector().log(
                self.name, self.ACTION_CRAWL_ERROR,
                "Error during publication date extraction: %s" % str(exc))
            return super(
                AfpyJobSpider,
                self)._get_from_page__publication_datetime(job_container)


source = JobSource.from_job_spider(AfpyJobSpider)
Code example #11
File: humancoders.py (project: pyjobs/crawlers)
    name = 'human'
    start_urls = ['http://jobs.humancoders.com/python']
    label = 'Human coders'
    url = 'http://jobs.humancoders.com/'
    logo_url = 'http://jobs.humancoders.com/assets/logo-b2ddc104507a3e9f623788cf9278ba0e.png'

    _crawl_parameters = {
        'from_page_enabled': True,

        'from_list__jobs_lists__css': 'body',
        'from_list__jobs__css': 'li.job',
        'from_list__url__css': 'div.job_title h2 a::attr(href)',
        'from_list__title__css': 'div.job_title h2 a::text',
        'from_list__publication_datetime__css': 'div.date::text',
        'from_list__tags__css': 'ul.tags li p::text',
        'from_list__company__css': 'div.company span.company_name::text',
        'from_list__address__css': 'div.location::text',
        'from_page__container__css': 'body',
        'from_page__company_url__css': 'div.company_url a::attr(href)',
        'from_page__description__css': '#description'
    }

    def _get_from_list__publication_datetime(self, node):
        raw_date = self._extract_first(node, 'from_list__publication_datetime')
        if raw_date:  # The date looks like "24 août 2015"
            raw_date_english = self._month_french_to_english(raw_date)  # Convert it to English
            return datetime.strptime(raw_date_english, '%d %B %Y')  # Parse the date out of that text

# Don't forget this line
source = JobSource.from_job_spider(HumanCodersSpider)
Code example #12
File: bluecoders.py (project: pyjobs/crawlers)
    def _get_from_list__url(self, node):
        extracted_url = self._extract_first(node, 'from_list__url', required=True)
        return self._get_absolute_url(extracted_url.encode('utf-8'))

    def _get_from_list__tags(self, node):
        """
        Tags are hidden in img/alt ('alt="recrutement développeur python"')
        TODO: find another way to create tags instead of instantiating Tag objects here
        """

        raw_tags = self._extract_all(node, 'from_list__tags')

        if raw_tags:
            return [Tag(tag.split()[-1]) for tag in raw_tags]

        return []  # No tags found

    def _get_from_list__publication_datetime(self, node):
        """
            The datetime is humanized/natural (ex: "2 days ago")
            Our goal here is to parse to a datetime object
        """
        raw_date = self._extract_first(node, 'from_list__publication_datetime')
        if raw_date:
            cal = parsedatetime.Calendar()
            time_struct, parse_status = cal.parse(raw_date)
            return datetime(*time_struct[:6])


source = JobSource.from_job_spider(BlueCodersSpider)
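
Note: parsedatetime converts natural-language dates ("2 days ago") into a time.struct_time-like tuple, whose first six fields (year through second) are enough to build a datetime. A small usage example:

import parsedatetime
from datetime import datetime

cal = parsedatetime.Calendar()
# parse() returns (time_struct, parse_status); parse_status is non-zero on success
time_struct, parse_status = cal.parse('2 days ago')
if parse_status:
    print(datetime(*time_struct[:6]))  # the datetime two days before now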
Code example #13
File: lolix.py (project: pyjobs/crawlers)
                    and not self.match_str(content,
                                           self.address_forbidden_content()):
                address += content + ', '

        return address

    def _item_satisfying(self, item):
        satisfying = super(LolixJobSpider, self)._item_satisfying(item)
        if satisfying:
            # Keep only offers whose title mentions Python
            if 'python' not in item['title'].lower():
                return False
        return satisfying

    def address_forbidden_content(self):
        return [
            u'Tél', u'offre', u'Administration', u'BTP', u'Enseignement',
            u'Industrie', u'Informatique', u'Recherche', u'Editeur',
            u'Internet', u'SSII'
        ]

    def match_str(self, string, forbidden_string_items):
        for forbidden_item in forbidden_string_items:
            if forbidden_item in string:
                return True

        return False


source = JobSource.from_job_spider(LolixJobSpider)
Code example #14
File: alsacreations.py (project: pyjobs/crawlers)
        'from_page__container__css': 'div.fiche',
        'from_page__title__css': '#premier h2[itemprop=title]::text',
        'from_page__publication_datetime__css': 'p.navinfo time::attr(datetime)',
        'from_page__company__css': '#second h3.nom::text',
        'from_page__company_url__css': '#second a[itemprop=url]::attr(href)',
        'from_page__address__css': '#premier b[itemprop=jobLocation]::text',
        'from_page__description__css': '#premier p[itemprop=description]',
        'from_page__tags__css': '#premier p[itemprop=skills] b::text',
    }

    def _get_from_list__publication_datetime(self, job_node):
        return datetime.now() 

    def _get_from_page__publication_datetime(self, job_node):
        date_text = self._extract_first(job_node, 'from_page__publication_datetime')
        if date_text:
            return date_text  # raw value of the time element's datetime attribute
        # Fall back to the list-page logic, which returns datetime.now()
        return super(AlsaCreationsSpider, self)._get_from_list__publication_datetime(job_node)

    # def _get_from_page__tags(self, job_node):
    #     # TODO - 2016-02-18 - D.A. - Make tags import ok
    #     # Use the standard tags methods to extract tags (according to the base list)
    #     tags = self._extract_all(job_node, 'from_page__tags')
    #     if tags:
    #         return tags
    #     return super(AlsaCreationsSpider, self)._get_from_page__tags(job_node)


# Don't forget this line
source = JobSource.from_job_spider(AlsaCreationsSpider)
Code example #15
File: afpy.py (project: BenoitEchernier/crawlers)
        'from_page__title__xpath': './h1[@id="parent-fieldname-title"]/text()',
        'from_page__company__xpath': ('.//h4/a/text()', './/h4/text()'),
        'from_page__company_url__xpath': './div[@id="content-core"]/div[@id="content-core"]/h4/a/@href',
        'from_page__address__xpath': './/h4[1]/following-sibling::div[@class="row"]/text()',
        'from_page__description__css': '#content',
        'from_page__tags__xpath': './div[@id="content-core"]/div[@id="content-core"]'
    }

    def _get_from_list__publication_datetime(self, job_container):
        try:
            publication_date_text = self._extract_first(job_container, 'from_list__publication_datetime')
            if publication_date_text:
                publication_date_text_clean = publication_date_text.replace(u'Créé le ', '').replace(u' par', '')
                return datetime.strptime(publication_date_text_clean, '%d/%m/%Y %H:%M')
            return super(AfpyJobSpider, self)._get_from_page__publication_datetime(job_container)
        except Exception as exc:
            self.get_connector().log(
                    self.name,
                    self.ACTION_CRAWL_ERROR,
                    "Error during publication date extraction: %s" % str(exc)
            )
            return super(AfpyJobSpider, self)._get_from_page__publication_datetime(job_container)

    def _get_from_page__description(self, node):
        description = super(AfpyJobSpider, self)._get_from_page__description(node)
        if description:
            return re.sub('<h1[^>]*?>.*?</h1>', '', description)
        return description

source = JobSource.from_job_spider(AfpyJobSpider)
Code example #16
File: humancoders.py (project: pyjobs/crawlers)
    label = 'Human coders'
    url = 'http://jobs.humancoders.com/'
    logo_url = 'http://jobs.humancoders.com/assets/logo-b2ddc104507a3e9f623788cf9278ba0e.png'

    _crawl_parameters = {
        'from_page_enabled': True,
        'from_list__jobs_lists__css': 'body',
        'from_list__jobs__css': 'li.job',
        'from_list__url__css': 'div.job_title h2 a::attr(href)',
        'from_list__title__css': 'div.job_title h2 a::text',
        'from_list__publication_datetime__css': 'div.date::text',
        'from_list__tags__css': 'ul.tags li p::text',
        'from_list__company__css': 'div.company span.company_name::text',
        'from_list__address__css': 'div.location::text',
        'from_page__container__css': 'body',
        'from_page__company_url__css': 'div.company_url a::attr(href)',
        'from_page__description__css': '#description'
    }

    def _get_from_list__publication_datetime(self, node):
        raw_date = self._extract_first(node, 'from_list__publication_datetime')
        if raw_date:  # The date looks like "24 août 2015"
            raw_date_english = self._month_french_to_english(
                raw_date)  # Convert it to English
            return datetime.strptime(
                raw_date_english, '%d %B %Y')  # Parse the date out of that text


# Don't forget this line
source = JobSource.from_job_spider(HumanCodersSpider)
Code example #17
        'from_list__jobs_lists__css':
        'div#offrescartezone div.result-page table.definition-table',
        'from_list__jobs__css': 'tr[itemtype="http://schema.org/JobPosting"]',
        'from_list__url__css': 'a::attr(href)',
        'from_list__title__css': 'a.title::text',
        'from_list__company__css': 'span.company span[itemprop=name]::text',
        'from_list__next_page__css': None,
        # FIXME - D.A. - 2016-02-19 - next page is protected by javascript
        # This is not a problem for us (we crawl every 15 minutes)
        'from_page__container__css': '#offre-body',
        'from_page__title__css': 'h4[itemprop=title]',
        'from_page__publication_datetime__css':
        'span[itemprop=datePosted]::text',
        'from_page__company__css': '#second h3.nom::text',
        'from_page__address__css': 'li[itemprop=addressRegion]::text',
        'from_page__description__css': '#offre-body p[itemprop=description]',
        'from_page__tags__css': 'p[itemprop=description]::text',
    }

    def _get_from_page__publication_datetime(self, job_node):
        date_text = self._extract_first(job_node,
                                        'from_page__publication_datetime')
        if date_text:
            return datetime.strptime(date_text, '%d/%m/%Y')
        return super(PoleEmploiSpider,
                     self)._get_from_page__publication_datetime(job_node)


# Don't forget this line
source = JobSource.from_job_spider(PoleEmploiSpider)
Code example #18
File: alsacreations.py (project: pyjobs/crawlers)
        'p.navinfo time::attr(datetime)',
        'from_page__company__css': '#second h3.nom::text',
        'from_page__company_url__css': '#second a[itemprop=url]::attr(href)',
        'from_page__address__css': '#premier b[itemprop=jobLocation]::text',
        'from_page__description__css': '#premier p[itemprop=description]',
        'from_page__tags__css': '#premier p[itemprop=skills] b::text',
    }

    def _get_from_list__publication_datetime(self, job_node):
        return datetime.now()

    def _get_from_page__publication_datetime(self, job_node):
        date_text = self._extract_first(job_node,
                                        'from_page__publication_datetime')
        if date_text:
            return date_text  # raw value of the time element's datetime attribute
        # Fall back to the list-page logic, which returns datetime.now()
        return super(AlsaCreationsSpider,
                     self)._get_from_list__publication_datetime(job_node)

    # def _get_from_page__tags(self, job_node):
    #     # TODO - 2016-02-18 - D.A. - Make tags import ok
    #     # Use the standard tags methods to extract tags (according to the base list)
    #     tags = self._extract_all(job_node, 'from_page__tags')
    #     if tags:
    #         return tags
    #     return super(AlsaCreationsSpider, self)._get_from_page__tags(job_node)


# Don't forget this line
source = JobSource.from_job_spider(AlsaCreationsSpider)