def scrape(self, **kwargs):
    """
    Scrape one news listing page and return its entries.

    Keyword Args:
        page: optional page identifier. When present it is appended to
            ``self.base_url`` as an extra path segment.

    Returns:
        A list of dicts, one per scraped title, each holding the keys
        ``header_hash`` (MD5 hex digest of the title), ``scraping_date``,
        ``new_header``, ``source`` and ``public_date``.

    Raises:
        EmptyNewsPageException: when the response history of the probe
            request contains exactly one entry. The exception receives
            the requested page, or ``None`` when no page was requested.
    """
    driver, user_agent = get_driver()
    url = self.base_url

    # Always bind ``page``: it is handed to EmptyNewsPageException below
    # even when the caller did not ask for a specific page.
    page = kwargs.get('page')
    if 'page' in kwargs:
        # Join with "/" explicitly; os.path.join would use "\" on Windows
        # and produce an invalid URL.
        separator = '' if not url or url.endswith('/') else '/'
        url = f"{url}{separator}{page}"

    if is_dot_docker_env_there():
        response = requests.get(url=url)
    else:
        response = requests.get(url=url, headers={'User-Agent': user_agent})

    # NOTE(review): a single entry in ``history`` means the request was
    # redirected once; presumably the site redirects pages that have no
    # news. Confirm against the target site.
    if len(response.history) == 1:
        raise EmptyNewsPageException(page)

    driver.get(url)
    titles = self.__get_titles(driver)
    # Dates are normalized before being handed to the backend.
    dates = [normalize_date(date) for date in self.__get_dates(driver)]
    sources = self.__get_sources(driver)

    # One timestamp is shared by every entry of this scraping run.
    scraped_at = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

    # zip() stops at the shortest of titles/sources/dates.
    return [
        {
            'header_hash': hashlib.md5(title.encode()).hexdigest(),
            'scraping_date': scraped_at,
            'new_header': title,
            'source': source,
            'public_date': date,
        }
        for title, source, date in zip(titles, sources, dates)
    ]
def _decorate_file_entry(cls, entry) -> tuple:
    """
    Build the decorated row that describes a single file entry.

    The returned tuple holds, in order:
      1. the entry name prefixed with "» ", cut to its first 33
         characters and styled with ``fg(226)``;
      2. a ``DecoratedData`` pairing the raw modification time with its
         formatted representation;
      3. a ``DecoratedData`` pairing the raw size in bytes with its
         human readable representation;
      4. the MIME type reported by ``magic.from_file``.
    """
    # Colored (yellow per the original comment), truncated display name.
    label = stylize("» " + entry.name[:33], fg(226))

    # Stat once so that the date and the size come from the same snapshot
    # of the file, and a redundant system call is avoided.
    info = os.stat(entry)

    # st_mtime is expressed in seconds (float), not nanoseconds.
    modified_at = info.st_mtime
    modified = DecoratedData(
        modified_at, normalize_date('%h %d %Y %H:%M', modified_at))

    size_in_bytes = info.st_size
    size = DecoratedData(size_in_bytes, bytes_to_human_readable(size_in_bytes))

    # Evaluate the file type as a MIME string.
    mime_type = magic.from_file(entry.path, mime=True)

    return (label, modified, size, mime_type)
def _decorate_dir_entry(cls, entry) -> tuple:
    """
    Build the decorated row that describes a single directory entry.

    The returned tuple holds, in order: the styled entry name (prefixed
    with "■ ", sliced to its first 33 characters and suffixed with "/"),
    a ``DecoratedData`` for the modification time, a ``DecoratedData``
    for the size obtained from ``DirectoryFiles().get_dir_size`` and the
    "-" marker used in place of a file type.
    """
    # Styled display name (orange per the original comment).
    label = stylize("■ " + entry.name[:33] + "/", fg(202))

    # Raw modification time together with its formatted counterpart.
    modified_at = os.stat(entry).st_mtime
    modified = DecoratedData(
        modified_at, normalize_date('%h %d %Y %H:%M', modified_at))

    # Size is delegated to DirectoryFiles.get_dir_size for the directory.
    total_bytes = DirectoryFiles().get_dir_size(entry)
    size = DecoratedData(total_bytes, bytes_to_human_readable(total_bytes))

    # '-' is the directory type identifier.
    return (label, modified, size, '-')