def preprocess(cls, virtual_fs: VirtualFS, root: Directory):
    """Drop uninteresting table exports and pages with an empty body.

    Two passes are made over the tree under ``root``:

    1. Every directory directly under ``root`` whose filename starts with
       ``"Interesting_articles"`` is handed to ``cls._empty_directory``.
    2. Every HTML page that has a ``<div class="page-body">`` whose content
       is only whitespace is removed from its parent's ``files`` list.

    Args:
        virtual_fs: Not used by this step.
        root: Root directory of the tree to clean up; modified in place.
    """
    settings.logger.info("Removing dull notion.so table files..")

    # Snapshot the walk before mutating: emptying a directory while the
    # walker is still traversing the tree could make it skip entries.
    for directory in list(root.walk_dirs()):
        is_top_level = directory.parent is root
        if directory.filename.startswith("Interesting_articles") and is_top_level:
            cls._empty_directory(directory)

    # Same reason here: pages are removed from `parent.files`, which the
    # walker may be iterating over lazily.
    for page in list(root.walk_htmls()):
        page_body_tags = page.dom.find("div", {"class": "page-body"})
        if page_body_tags and not page_body_tags[0].getContent().strip():
            page.parent.files.remove(page)
def preprocess(cls, virtual_fs: VirtualFS, root: Directory):
    """Rename HTML pages after their titles and clean up directory names.

    Pages named ``index.html`` and pages whose title contains ``"://"`` are
    left untouched. Every other page gets a filename derived from its title
    (normalized, trimmed, then passed through
    ``cls._make_sure_filename_is_unique``) with ``.html`` appended.
    Afterwards every directory name is run through ``cls._unfuck_filename``
    and the same uniqueness helper.

    Args:
        virtual_fs: Not used by this step.
        root: Root directory of the tree to rename; modified in place.
    """
    settings.logger.info("Unfucking filenames..")

    for page in root.walk_htmls():
        if page.filename != "index.html":
            page_title = page.title
            # Titles that look like URLs are not used as filenames.
            if "://" not in page_title:
                candidate = cls._make_sure_filename_is_unique(
                    page,
                    cls._trim_long_filenames(cls._normalize_fn(page_title)),
                )
                # NOTE(review): the rename is skipped when the helpers yield
                # an empty name - confirm both statements belong under this
                # condition.
                if candidate:
                    cls._rename_also_section_dirs(page, candidate)
                    page.filename = candidate + ".html"

    for directory in root.walk_dirs():
        cleaned_name = cls._unfuck_filename(directory.filename)
        directory.filename = cls._make_sure_filename_is_unique(
            directory, cleaned_name
        )