Example #1
 def read_configuration(self, config):
     """
     Azoknal az ertekeknel, ami mindenkeppen specifikus, ott a [] operatort
     kell hasznalni, ahol elkepzelheto deafult value, ott a get()-et
     - name: Tudjuk, hogy melyik oldalt szedjuk le
     - all_job_url: az url cim, ahol az osszes munka listazva van
     - all_job_container_html_tag: ha esetleg az oldal kerete valtozna, de
     maga az osszes munka nem, akkor ne scrapeljunk foloslegesen
     - single_job_html_tag: hogy az oldalon levo munkakon vegig tudjunk
     iteralni
     - cache: eltaroljuk a legutobbi scrapelt oldal html contentet, hogy
     csak akkor szedjuk le uj infot, amikor tenyleg valtozott az oldal
     - json: scrapelt adatok, ezek mennek majd tovabb a converternek
     """
     config = config['DEFAULT']
     self.provider_name = config['ProviderName']
     self.base_url = config['BaseUrl']
     self.all_job_url = config['AllJobUrl']
     self.all_job_container_html_element = \
         config['AllJobContainerHtmlElement']
     self.all_job_container_html_class = config['AllJobContainerHtmlClass']
     self.single_job_html_tag = config['SingleJobHtmlTag']
     self.single_job_href_tag = config['SingleJobHrefTag']
     self.cache = os.path.join(
         get_dynamic_parent_folder(self.__class__),
         config.get('Cache', '.cache.html'))
     self.json = os.path.join(
         get_dynamic_parent_folder(self.__class__),
         config.get('JSON', '.scraped_jobs.json'))
     self.job_attrs = {}
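For context, a scraper.ini that satisfies read_configuration might look like the following; the key names come from the code above, while every value is invented:

    [DEFAULT]
    ; All values below are hypothetical; only the key names are real.
    ProviderName = ExampleJobs
    BaseUrl = https://example.com/
    AllJobUrl = jobs/all
    AllJobContainerHtmlElement = div
    AllJobContainerHtmlClass = job-list
    SingleJobHtmlTag = article
    SingleJobHrefTag = a
    ; Optional keys read with get(); these are the fallbacks used when absent:
    ; Cache = .cache.html
    ; JSON = .scraped_jobs.json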
Example #2
 def __init__(self, config_file_name="scraper.ini"):
     config_file = os.path.join(
         get_dynamic_parent_folder(self.__class__),
         config_file_name)
     config = configparser.ConfigParser()
     config.read(config_file)
     self.read_configuration(config)

 def __init__(self, config_file_name="converter.ini"):
     self.title = None
     self.job_type = None
     self.task = None
     self.place_of_work = None
     self.min_salary = None
     self.max_salary = None
     self.requirements = None
     self.working_hours = None
     self.other = None
     self.url = None
     config_file = os.path.join(
         get_dynamic_parent_folder(self.__class__),
         config_file_name)
     config = configparser.ConfigParser()
     config.read(config_file)
     self.read_configuration(config)
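Both constructors resolve their .ini next to the class's own source file through get_dynamic_parent_folder, which is not shown in these examples. A minimal sketch of what it plausibly does, assuming it returns the directory of the module that defines the class:

    import inspect
    import os

    def get_dynamic_parent_folder(cls):
        # Hypothetical reconstruction: directory of the file defining `cls`,
        # so every subclass finds its config/cache next to its own module.
        return os.path.dirname(os.path.abspath(inspect.getfile(cls)))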
Example #4
    def download_and_save_current_html(self):
        """
        Downloads the HTML found at the base URL and saves it.

        :return: (saved file path, html_content)
        """
        url = urljoin(self.base_url, self.all_job_url)
        current_html = requests.get(url).text
        beautiful_soup = BeautifulSoup(current_html, 'html.parser')
        soup = beautiful_soup.find(
            self.all_job_container_html_element,
            class_=self.all_job_container_html_class
        )
        file_path = os.path.join(
            get_dynamic_parent_folder(self.__class__), '.current.html'
        )
        with open(file_path, 'w') as f:
            f.write(str(soup))
        return file_path, soup
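One caveat: requests.get() returns error pages without raising and has no default timeout. A defensive variant of the fetch, not part of the original, could be:

    # Hedged variant of the fetch above: fail fast on HTTP errors and avoid
    # hanging indefinitely, using `url` as computed in the method.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    current_html = response.text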
Example #5
    def cache_outdated(self):
        """
        Downloads and compares the current state of the job board, and
        compares it to the cache.

        :return: True, if cache is outdated
        """
        eu = EuDiakokScraper
        html = requests.get(eu.base_url).text
        soup = BeautifulSoup(html, 'html.parser')
        jobs = soup.find("div", id="munkakListaContainer")
        current = os.path.join(
            get_dynamic_parent_folder(EuDiakokScraper),
            ".current.html")
        with open(current, "w") as f:
            f.write(str(jobs))
        cache_outdated = True
        if os.path.isfile(self.cache):
            if filecmp.cmp(current, self.cache):
                cache_outdated = False
        else:
            self.update_cache(str(jobs))
        os.remove(current)
        return cache_outdated

    def cache_outdated(self):
        """
        Downloads the current state of the job board and compares it to
        the cache.

        :return: True if the cache is outdated
        """
        sh = self.__class__
        html = requests.get(sh.budapest_jobs).text
        soup = BeautifulSoup(html, 'html.parser')
        jobs = soup.find("div", class_="categories")
        current = os.path.join(
            get_dynamic_parent_folder(sh),
            ".current.html")
        with open(current, "w") as f:
            f.write(str(jobs))
        cache_outdated = True
        if os.path.isfile(self.cache):
            if filecmp.cmp(current, self.cache):
                cache_outdated = False
        else:
            self.update_cache(str(jobs))
        os.remove(current)
        return cache_outdated
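Both duplicates call self.update_cache(), which is not among these examples. Assuming it simply overwrites the cache file with the latest snapshot, a sketch could look like:

    def update_cache(self, html_content):
        # Hypothetical helper: persist the latest job-list snapshot so the
        # next cache_outdated() call has something to compare against.
        with open(self.cache, 'w') as f:
            f.write(html_content)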
Example #7
 def __init__(self, cache_file_name=".cache.html"):
     self.cache = os.path.join(
         get_dynamic_parent_folder(self.__class__),
         cache_file_name)
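Tying the examples together, a hypothetical driver (none of this wiring appears in the originals) might run:

    scraper = EuDiakokScraper()           # Example #2: loads scraper.ini
    if scraper.cache_outdated():          # Example #5: diff against the cache
        path, soup = scraper.download_and_save_current_html()  # Example #4
        scraper.update_cache(str(soup))   # keep the cache in sync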