def __init__(self, state, base_output_path, current_time, file_format, content=None):
    """
    Initialise the StateParser.

    :param state: state identifier interpolated into STATE_URL.
    :param base_output_path: base path to store files.
    :param current_time: current time to use for file names.
    :param file_format: format to store the files.
    :param content: pre-fetched page content; downloaded when omitted.
    """
    self.state = state
    self.regex = STATION_REGEX
    url = STATE_URL % state
    # Download the state page only when no content was supplied.
    if content:
        self.content = content
    else:
        self.content = utils.download_content(url)
    super(StateParser, self).__init__(
        base_output_path, current_time, file_format
    )
def download_chapters_list(self):
    """
    Fetch the book's index page and populate ``self.chapter_list``.

    Appends one ``Chapter`` per anchor tag found on the index page.
    Logs an error when no anchors match.

    :return: the raw list of matched anchor HTML snippets (empty on failure).
    """
    html = download_content(self.index_iri)
    # Chapter pages live alongside the "all.html" index page.
    page_prefix = self.index_iri.replace("all.html", "")
    search = re.findall(r'<a style="" href=".*?<\/a>', html)
    if search:
        for title_html in search:
            self.chapter_list.append(Chapter(title_html, page_prefix))
    else:
        logging.error(
            f'Failed to fetch this books chapter list from {self.index_iri}'
        )
    # Removed leftover debug truncation (`self.chapter_list[2088:]`) that
    # was marked "TODO delete this line" and silently dropped the first
    # 2088 chapters on every run.
    return search
def download_chapters_content(self):
    """
    Download, clean and save every chapter in ``self.chapter_list``.

    Creates a directory named after the book title, then for each chapter:
    fetches the raw page, extracts and cleans the text, saves it to disk,
    and sleeps 0-4 seconds so requests are not fired in a tight loop.
    """
    # Re-seed so the random inter-request delays differ between runs.
    random.seed()
    # Create the directory the chapters are saved into.
    out_dir = Path(f"{self.title}")
    out_dir.mkdir(parents=True, exist_ok=True)
    # Portable absolute path: the original hard-coded a Windows ".\\"
    # prefix, which produced a bogus path component on POSIX systems.
    abs_file_path = str(out_dir.resolve())
    for chapter in self.chapter_list:
        chapter.process_raw_title()
        logging.info(f'Downloading {chapter.title}')
        print(f'Downloading {chapter.title}')
        chapter.raw_content = download_content(chapter.link)
        chapter.extract_chapter()
        chapter.clean_chapter()
        chapter.save_chapter(abs_file_path)
        # Random pause between downloads to look less like a script.
        time.sleep(random.randrange(5))
def __init__(self, base_output_path, current_time, file_format, content=None):
    """
    Initialise the MainParser.

    :param base_output_path: base path to store files.
    :param current_time: current time to use for file names.
    :param file_format: format to store the files.
    :param content: content of the main page.
    """
    self.regex = STATE_REGEX
    # Fall back to downloading the main page when no content is supplied.
    self.content = content or utils.download_content(MAIN_URL)
    super(MainParser, self).__init__(
        base_output_path, current_time, file_format
    )
def download_data(self):
    """Download one data file per matched station for this state."""
    for station in self.get_match():
        logger.debug("Processing station %s", station)
        target = STATION_URL % (station, self.state, station)
        destination = self._get_station_filename(station)
        utils.download_content(target, destination)