Example #1

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []
        # requires: from urllib.parse import urlparse, urljoin
        parsed = urlparse(entry_url)
        scheme_host = f'{parsed.scheme}://{parsed.netloc}'

        previous_page_links = list(self.get_jobs_list(entry_url))
        for job_link in previous_page_links:
            # re-root the job path on the entry host (this also drops any query string)
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))

        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            loop_url = f'{entry_url}?{self.pagination}={page}'
            current_page_links = list(self.get_jobs_list(loop_url))
            if current_page_links == previous_page_links:
                # the site served the same links again: pagination is exhausted
                break
            for job_link in current_page_links:
                job_path = urlparse(job_link).path
                job_link = urljoin(scheme_host, job_path)
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            previous_page_links = current_page_links
            page += 1
        return self.jobs
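All of the fetch() variants on this page lean on the same scaffolding: a JobsList container plus get_jobs_list and get_job helpers, none of which are shown. The sketch below is a guess at that interface so the snippets can be run in isolation; the xpaths, the use of requests and lxml, and the BaseScraper name are assumptions, not the original code.

    # Hypothetical scaffolding for the fetch() examples; only the names
    # JobsList, get_jobs_list and get_job come from the snippets themselves.
    from typing import Iterator

    import requests
    from lxml.html import fromstring


    class JobsList(list):
        """Plain list subclass; the snippets only use append()/extend()."""


    class BaseScraper:
        jobs_list_xpath = '//a[@class="job-link"]/@href'  # assumed
        title_xpath = '//h1/text()'                       # assumed

        def get_jobs_list(self, url: str) -> Iterator[str]:
            # Yield the job-detail links found on one listing page.
            content = requests.get(url).content
            yield from fromstring(content).xpath(self.jobs_list_xpath)

        def get_job(self, job_url: str) -> dict:
            # Fetch one job page and extract a minimal record.
            content = requests.get(job_url).content
            titles = fromstring(content).xpath(self.title_xpath)
            return {'url': job_url, 'title': titles[0] if titles else None}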
Example #2
    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []

            # urljoin drops the last path segment unless the base ends with '/'
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
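The trailing-slash handling above matters because urljoin resolves relative paths against the directory of the base URL: without a trailing slash the last path segment is treated as a file and dropped. A quick standard-library illustration (the URL is made up):

    from urllib.parse import urljoin

    # No trailing slash: 'jobs' is treated as a file and replaced.
    print(urljoin('https://example.com/jobs', 'page/2/'))
    # -> https://example.com/page/2/

    # Trailing slash: the segment is kept, which is what the loop needs.
    print(urljoin('https://example.com/jobs/', 'page/2/'))
    # -> https://example.com/jobs/page/2/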
Example #3

    def fetch(self, entry_url: str):
        print(entry_url)
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            print(job_link)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []

            # replace the entry URL's last path segment with 'page<N>'
            loop_url = entry_url.rsplit('/', 1)[0] + f'/page{page}'

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
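rsplit('/', 1)[0] strips the entry URL's last path segment, so 'page<N>' replaces whatever segment the URL ended with. For instance, with a made-up URL:

    entry_url = 'https://example.com/jobs/all'
    page = 2
    print(entry_url.rsplit('/', 1)[0] + f'/page{page}')
    # -> https://example.com/jobs/page2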
Example #4
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s " % (job_link, e))

        page = 1
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []

            # requires: from requests.models import PreparedRequest
            # prepare() merges the page param into any existing query string
            prep_url = PreparedRequest()
            prep_url.prepare(url=entry_url, params={'page': page})
            next_page_url = prep_url.url

            for job_link in self.get_jobs_list(next_page_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error Processing %s %s " % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
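PreparedRequest (from requests.models) merges the page parameter into a URL that may already carry a query string, which is safer than concatenating '?page=N' by hand. A standalone illustration with a made-up URL:

    from requests.models import PreparedRequest

    prep = PreparedRequest()
    prep.prepare(url='https://example.com/jobs?dept=eng', params={'page': 3})
    print(prep.url)  # https://example.com/jobs?dept=eng&page=3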
Example #5

    def fetch(self, entry_url: str):
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []

            # append p=<page>, using '&' when a query string already exists
            loop_url = entry_url + (f'&p={page}'
                                    if '?' in entry_url else f'?p={page}')

            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
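The '?' check above is the hand-rolled version of query-string merging; urllib.parse can do the same thing while cleanly preserving any existing parameters. A sketch of the equivalent (the helper name and URL are made up):

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    def with_page(url: str, page: int) -> str:
        # Merge p=<page> into whatever query string is already present.
        parts = urlparse(url)
        query = dict(parse_qsl(parts.query))
        query['p'] = str(page)
        return urlunparse(parts._replace(query=urlencode(query)))

    print(with_page('https://example.com/jobs?q=python', 2))
    # -> https://example.com/jobs?q=python&p=2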
Example #6
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                # delegate to the base scraper's get_job implementation
                page_buffer.append(super().get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []
            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            for job_link in self.get_jobs_list(loop_url):
                try:
                    # delegate to the base scraper's get_job implementation
                    page_buffer.append(super().get_job(job_link))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_link, e))
            page += 1

        return self.jobs
Example #7
    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        page_buffer = []

        for job_link in self.get_jobs_list(entry_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error processing %s: %s" % (job_link, e))

        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            page_buffer = []

            entry_url += '' if entry_url.endswith('/') else '/'
            loop_url = urljoin(entry_url, f'page/{page}/')
            print(loop_url)
            for job_link in self.get_jobs_list(loop_url):
                try:
                    page_buffer.append(self.get_job(job_link))
                except Exception as e:
                    print("Error processing %s: %s" % (job_link, e))

            print("Scraped page %s" % page)
            page += 1

        return self.jobs
Example #8
    def fetch(self, entry_url: str) -> JobsList:
        page_buffer = self.fetch_page(entry_url)
        self.jobs = JobsList()
        page = 2
        while page_buffer:
            self.jobs.extend(page_buffer)
            loop_url = f'{entry_url}?{self.pagination}={page}'
            page_buffer = self.fetch_page(loop_url)
            page += 1

        return self.jobs
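Example #8 factors the per-page work into a fetch_page helper that is not shown on this page. Judging by how the other examples fill their page buffers, it plausibly looks like the sketch below; the body is a reconstruction, not the original code:

    def fetch_page(self, url):
        # Hypothetical reconstruction: collect the jobs from one listing page.
        page_buffer = []
        for job_link in self.get_jobs_list(url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        return page_buffer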
Example #9

    def fetch(self, entry_url: str) -> JobsList:
        self.jobs = JobsList()
        next_url = entry_url
        while next_url:
            # 'session' is assumed to be a shared requests.Session()
            content = session.get(next_url, headers=self.headers).content
            for job_url in self.get_urls_from_content(content):
                print(job_url)
                try:
                    self.jobs.append(self.get_job(job_url))
                except Exception as e:
                    print("Error adding job at %s %s" % (job_url, e))
            # requires: from lxml.html import fromstring
            urls = fromstring(content.decode()).xpath(self.pagination_xpath)
            if urls:
                # exactly one "next" link is expected; unpacking enforces that
                next_url_element, = urls
                next_url = urljoin(entry_url, next_url_element.attrib['href'])
            else:
                next_url = None
        return self.jobs
    def __init__(self):
        self.jobs = JobsList()
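Example #9 paginates by following the page's own "next" link via an XPath query instead of guessing URL patterns. A self-contained illustration of that extraction step with lxml (the HTML and XPath are made up):

    from urllib.parse import urljoin
    from lxml.html import fromstring

    html = '''
    <div class="pagination">
      <a class="next" href="/jobs?page=2">Next</a>
    </div>
    '''

    # Same idea as self.pagination_xpath in Example #9.
    urls = fromstring(html).xpath('//a[@class="next"]')
    if urls:
        next_url_element, = urls  # exactly one "next" link expected
        print(urljoin('https://example.com/jobs', next_url_element.attrib['href']))
        # -> https://example.com/jobs?page=2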