def get_current_openings_page(self, job_page):
    """Follow a 'current openings' / 'open positions' link on the job page,
    falling back to the page itself when no such link exists."""
    fetcher = PageFetcher()
    page_source = fetcher.fetch_page(job_page)
    soup = BeautifulSoup(page_source, 'html.parser')
    # Try the two common link texts in order of preference.
    for pattern in ("openings", "open positions"):
        link = soup.find("a", string=re.compile(pattern, re.I), href=True)
        if link:
            return resolve_one_relative_page(self.url, link['href'])
    return job_page
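# For example, a careers page containing
#     <a href="/current-openings">Current openings</a>
# would have that href resolved against self.url via
# resolve_one_relative_page; when neither link text matches, the
# original job_page URL is returned unchanged.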
def find_posting_links(self, soup):
    """Collect posting URLs from the paginated JSON feed at self.url.

    Note: the `soup` argument is unused here; it is kept for interface
    compatibility with the other finder methods.
    """
    fetcher = PageFetcher()
    offset = 0
    total = 1  # placeholder until the first response reports totalFound
    links = []
    while offset < total:
        content = fetcher.fetch_page(
            "{url}?offset={offset}".format(url=self.url, offset=offset))
        content_json = simplejson.loads(content)
        for posting in content_json['content']:
            links.append(posting['ref'])
        total = content_json['totalFound']
        offset += 100  # the feed serves postings in pages of 100
    return links
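# Sketch of the JSON payload find_posting_links expects, inferred from the
# fields used above ('content', 'ref', 'totalFound'). The exact schema
# depends on the job-board API and is an assumption, not confirmed here:
#
#   {
#       "content": [
#           {"ref": "https://example.com/postings/123", ...},
#           ...
#       ],
#       "totalFound": 250
#   }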
def start(self):
    self.logger.info("Starting job page finder on {orig}".format(orig=self.url))
    fetcher = PageFetcher()
    page_source = fetcher.fetch_page(self.url)
    soup = BeautifulSoup(page_source, 'html.parser')
    try:
        return self.find_job_pages(soup)
    except Exception as e:
        # The main page failed; fall back to any alternative pages,
        # returning as soon as one of them yields job pages.
        for link in self.find_alternative_pages(soup):
            self.logger.info("Checking alternative page: {u}".format(u=link))
            try:
                page_source = fetcher.fetch_page(link)
                alternative_soup = BeautifulSoup(page_source, 'html.parser')
                return self.find_job_pages(alternative_soup)
            except Exception:
                continue
        # No alternative worked; re-raise the original failure.
        raise e
def fetch_page(self):
    """Fetch self.url and return it parsed as a BeautifulSoup tree."""
    fetcher = PageFetcher()
    job_page_content = fetcher.fetch_page(self.url)
    return BeautifulSoup(job_page_content, 'html.parser')
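# Usage sketch (hypothetical: the enclosing class name and constructor are
# assumptions; the methods above only require `self.url` and `self.logger`):
#
#   finder = JobPageFinder("https://example.com/careers")
#   job_pages = finder.start()
#   for page in job_pages:
#       print(finder.get_current_openings_page(page))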