def scrape(self, **kwargs):
    """
    Scrape news headers from the base url, or from one of its pages.

    When a ``page`` keyword argument is given, it is appended to
    ``self.base_url`` and the resulting url is first probed with
    ``requests``. A probe that was redirected exactly once is treated
    as an empty page (presumably the site redirects away from pages
    that do not exist -- confirm against the target site).

    Returns a list of dicts, one per scraped header, with the keys
    ``header_hash``, ``scraping_date``, ``new_header``, ``source`` and
    ``public_date``. Raises EmptyNewsPageException when the requested
    page turns out to be empty.
    """
    # Local import: urls always use forward slashes, whereas os.path.join
    # would insert a backslash on Windows.
    import posixpath

    driver, user_agent = get_driver()
    url = self.base_url

    if 'page' in kwargs:
        page = kwargs['page']
        url = posixpath.join(url, str(page))

        # Inside the docker environment the probe is sent without a
        # custom User-Agent; headers=None is the default of requests.get.
        if is_dot_docker_env_there():
            headers = None
        else:
            headers = {'User-Agent': user_agent}
        response = requests.get(url=url, headers=headers)

        # response.history holds the redirect responses that were followed
        if len(response.history) == 1:
            raise EmptyNewsPageException(page)

    driver.get(url)
    titles = self.__get_titles(driver)
    # process dates before pushing them to the backend
    dates = [normalize_date(date) for date in self.__get_dates(driver)]
    sources = self.__get_sources(driver)
    hashes = [hashlib.md5(title.encode()).hexdigest() for title in titles]
    # a single timestamp is shared by every record of this run
    scraped_at = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    chronos = [scraped_at] * len(hashes)

    # return data as a list of dicts; zip stops at the shortest column
    keys = ('header_hash', 'scraping_date', 'new_header', 'source',
            'public_date')
    return [
        dict(zip(keys, row))
        for row in zip(hashes, chronos, titles, sources, dates)
    ]
Exemple #2
0
    def _decorate_file_entry(cls, entry) -> tuple:
        """
        Decorates given entry for a file. Decorate means that it creates
        a colored representation of the name of the entry, grabs
        the date it was last modified and its size in bytes, wraps both
        in DecoratedData next to their formatted form, and determines
        the MIME type of the file. Collects everything and returns it
        as a tuple: (name, modified date, size, MIME type).
        """

        # Gives yellow color to the string & keeps at most 33 chars of the name
        name = stylize("» " + entry.name[:33], fg(226))

        # Stat the entry only once, so that the date and the size are
        # taken from the same snapshot of the file
        info = os.stat(entry)

        # Last modified time (st_mtime is expressed in seconds)
        date = info.st_mtime
        modified = DecoratedData(
            date, normalize_date('%h %d %Y %H:%M', date))

        b = info.st_size
        size = DecoratedData(b, bytes_to_human_readable(b))

        # Evaluate the file type
        file_type = magic.from_file(entry.path, mime=True)
        return (name, modified, size, file_type)
Exemple #3
0
    def _decorate_dir_entry(cls, entry) -> tuple:
        """
        Builds the display row for a directory entry.

        The row is a tuple of four cells: the colored name of the entry
        followed by a slash, the date it was last modified, the size of
        the directory (both wrapped in DecoratedData next to their
        formatted form) and a '-' standing in for the file type.
        """

        # Orange label: marker, at most 33 chars of the name, trailing slash
        label = stylize("■ " + entry.name[:33] + "/", fg(202))

        # Last modified date, raw and formatted
        modified = os.stat(entry).st_mtime
        modified_cell = DecoratedData(
            modified, normalize_date('%h %d %Y %H:%M', modified))

        # Size of the folder as reported by DirectoryFiles.get_dir_size
        total = DirectoryFiles().get_dir_size(entry)
        size_cell = DecoratedData(total, bytes_to_human_readable(total))

        # '-' is the directory type identifier
        return (label, modified_cell, size_cell, '-')