# requires: from urllib.parse import urljoin, urlparse
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    # Keep only scheme + host so relative job paths can be resolved against it.
    parsed = urlparse(entry_url)
    scheme_host = parsed.scheme + '://' + parsed.netloc
    previous_page_links = list(self.get_jobs_list(entry_url))
    for job_link in previous_page_links:
        job_path = urlparse(job_link).path
        job_link = urljoin(scheme_host, job_path)
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        loop_url = f'{entry_url}?{self.pagination}={page}'
        current_page_links = list(self.get_jobs_list(loop_url))
        # Stop once a page returns the same links as the previous page (no new results).
        if current_page_links == previous_page_links:
            break
        for job_link in current_page_links:
            job_path = urlparse(job_link).path
            job_link = urljoin(scheme_host, job_path)
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        previous_page_links = current_page_links
        page += 1
    return self.jobs
# requires: from urllib.parse import urljoin
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        # Pagination uses path segments of the form .../page/<n>/.
        entry_url += '' if entry_url.endswith('/') else '/'
        loop_url = urljoin(entry_url + 'page/', f'{page}/')
        for job_link in self.get_jobs_list(loop_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page += 1
    return self.jobs
def fetch(self, entry_url: str) -> JobsList:
    print(entry_url)
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        print(job_link)
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        # Replace the last path segment with the site's /page<n> suffix.
        loop_url = entry_url.rsplit('/', 1)[0] + f'/page{page}'
        for job_link in self.get_jobs_list(loop_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page += 1
    return self.jobs
# requires: from requests.models import PreparedRequest
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error Processing %s %s " % (job_link, e))
    page = 1
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        # Let PreparedRequest append the page number as a query parameter,
        # preserving any query string already present on entry_url.
        prep_url = PreparedRequest()
        prep_url.prepare(url=entry_url, params={'page': page})
        next_page_url = prep_url.url
        for job_link in self.get_jobs_list(next_page_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error Processing %s %s " % (job_link, e))
        print("Scraped page %s" % page)
        page += 1
    return self.jobs
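# For reference, PreparedRequest merges the params dict into whatever query
# string the URL already carries, so entry_url may or may not contain a '?'.
# A standalone illustration (the URL below is a made-up example, not one of
# the scraped sites):
#
#     from requests.models import PreparedRequest
#     req = PreparedRequest()
#     req.prepare(url='https://example.com/jobs?remote=1', params={'page': 3})
#     req.url  # 'https://example.com/jobs?remote=1&page=3'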
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        # Append the page number as a p= parameter, using '&' when the URL
        # already has a query string and '?' otherwise.
        loop_url = entry_url + (f'&p={page}' if '?' in entry_url else f'?p={page}')
        for job_link in self.get_jobs_list(loop_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page += 1
    return self.jobs
# requires: from urllib.parse import urljoin
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        try:
            # get_job is called explicitly on AbstractTokenProvider rather than via self.
            page_buffer.append(AbstractTokenProvider.get_job(self, job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        # Pagination uses path segments of the form .../page/<n>/.
        entry_url += '' if entry_url.endswith('/') else '/'
        loop_url = urljoin(entry_url, f'page/{page}/')
        for job_link in self.get_jobs_list(loop_url):
            try:
                page_buffer.append(AbstractTokenProvider.get_job(self, job_link))
            except Exception as e:
                print("Error adding job at %s %s" % (job_link, e))
        page += 1
    return self.jobs
# requires: from urllib.parse import urljoin
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    page_buffer = []
    for job_link in self.get_jobs_list(entry_url):
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception:
            print("Error Processing %s " % job_link)
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        page_buffer = []
        entry_url += '' if entry_url.endswith('/') else '/'
        loop_url = urljoin(entry_url, f'page/{page}/')
        print(loop_url)
        for job_link in self.get_jobs_list(loop_url):
            try:
                page_buffer.append(self.get_job(job_link))
            except Exception:
                print("Error Processing %s " % job_link)
        print("Scraped page %s" % page)
        page += 1
    return self.jobs
def fetch(self, entry_url: str) -> JobsList:
    page_buffer = self.fetch_page(entry_url)
    self.jobs = JobsList()
    page = 2
    while page_buffer:
        self.jobs.extend(page_buffer)
        loop_url = f'{entry_url}?{self.pagination}={page}'
        page_buffer = self.fetch_page(loop_url)
        page += 1
    return self.jobs
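# fetch_page is not shown in this section. A minimal sketch of what such a
# helper could look like, assuming it reuses the same get_jobs_list/get_job
# interface as the other fetch variants above (hypothetical, not the actual
# implementation):
def fetch_page(self, page_url: str) -> list:
    """Collect the jobs linked from a single results page."""
    page_buffer = []
    for job_link in self.get_jobs_list(page_url):
        try:
            page_buffer.append(self.get_job(job_link))
        except Exception as e:
            print("Error adding job at %s %s" % (job_link, e))
    return page_buffer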
# requires: from urllib.parse import urljoin
#           from lxml.html import fromstring
#           a module-level `session` (e.g. a requests.Session)
def fetch(self, entry_url: str) -> JobsList:
    self.jobs = JobsList()
    next_url = entry_url
    # Follow the page's own "next" link instead of guessing page numbers.
    while next_url:
        content = session.get(next_url, headers=self.headers).content
        for job_url in self.get_urls_from_content(content):
            print(job_url)
            try:
                self.jobs.append(self.get_job(job_url))
            except Exception as e:
                print("Error adding job at %s %s" % (job_url, e))
        # pagination_xpath is expected to match exactly one "next page" link;
        # the tuple unpacking fails loudly if a page ever contains more.
        urls = fromstring(content.decode()).xpath(self.pagination_xpath)
        if urls:
            next_url_element, = urls
            next_url = urljoin(entry_url, next_url_element.attrib['href'])
        else:
            next_url = None
    return self.jobs
def __init__(self):
    self.jobs = JobsList()
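# JobsList itself is defined elsewhere; judging by the append/extend calls in
# the fetch methods above, it behaves like a list of job records. A minimal
# stand-in for reading these snippets in isolation (hypothetical, the real
# class may add extra behaviour such as serialization or de-duplication):
class JobsList(list):
    """Plain list of scraped job records."""
    pass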