def get_file_path(out_dir, title_iast, author_iast="", catalog_number=""):
    """Compose a cleaned markdown file path under out_dir from IAST metadata.

    The title and author are transliterated from IAST to OPTITRANS so the
    resulting file name is ASCII-friendly.
    """
    title_optitrans = sanscript.transliterate(data=title_iast, _from=sanscript.IAST, _to=sanscript.OPTITRANS)
    author_optitrans = sanscript.transliterate(data=author_iast, _from=sanscript.IAST, _to=sanscript.OPTITRANS)
    raw_name = "%s_%s_%s.md" % (title_optitrans, author_optitrans, catalog_number.strip())
    cleaned_name = file_helper.clean_file_path(file_path=raw_name)
    return os.path.join(out_dir, cleaned_name)
def set_filename_from_title(self, transliteration_source, dry_run):
    """Rename this markdown file so that its name reflects its title.

    :param transliteration_source: sanscript scheme of the title; when not None,
        the title is transliterated to OPTITRANS before building the file name.
    :param dry_run: when True, log the intended rename but do not touch the disk.
    """
    # Any path ending in "_index.md" is a special index file - leave it alone.
    if str(self.file_path).endswith("_index.md"):
        logging.info("Special file %s. Skipping." % self.file_path)
        return
    title = self.get_title(omit_chapter_id=False)
    if transliteration_source is not None:
        title = sanscript.transliterate(data=title, _from=transliteration_source, _to=sanscript.OPTITRANS)
    # NOTE(review): the original also branched on basename == "_index.md" here
    # (setting extension = ""), but that branch was unreachable - the
    # endswith("_index.md") guard above already returns for every such path.
    # The dead branch has been removed; behavior is unchanged.
    current_path = self.file_path
    file_name = title.strip()
    # Collapse runs of separators so names stay tidy.
    file_name = regex.sub("[ _.]+", "_", file_name)
    file_name = regex.sub("-+", "-", file_name)
    file_name = file_name + ".md"
    file_name = file_helper.clean_file_path(file_name)
    file_path = os.path.join(os.path.dirname(current_path), file_name)
    if str(current_path) != file_path:
        logging.info("Renaming %s to %s", current_path, file_path)
        if not dry_run:
            os.rename(src=current_path, dst=file_path)
            # Only track the new path when the rename actually happened;
            # a dry run leaves both disk and object state untouched.
            # NOTE(review): nesting of this assignment was ambiguous in the
            # collapsed source - confirm dry-run semantics against callers.
            self.file_path = file_path
def markdownify_local_htmls(src_dir, dest_dir, dumper, dry_run=False):
    """Run `dumper` over every local .htm/.html under src_dir, mirroring the tree into dest_dir.

    :param dumper: callable accepting url, outfile_path, index and dry_run keywords.
    """
    html_paths = sorted(Path(src_dir).glob("**/*.htm*"))
    for position, html_path in enumerate(html_paths):
        # Swap the extension to .md and re-root the path under dest_dir.
        out_path = str(html_path).replace(".html", ".md").replace(".htm", ".md").replace(src_dir, dest_dir)
        out_path = file_helper.clean_file_path(out_path)
        _ = dumper(url="file://" + str(html_path), outfile_path=out_path, index=position, dry_run=dry_run)
def markdownify_all(src_dir, dest_dir):
    """Convert every doc_*/*.html file under src_dir into markdown under dest_dir.

    Files whose metadata cannot be extracted are skipped with a warning; the
    destination file name comes from the "itxtitle" metadata field.
    """
    for html_path in sorted(Path(src_dir).glob("**/doc_*/*.html")):
        metadata = get_metadata(src_file=html_path)
        if metadata == {}:
            logging.warning("No metadata found for %s", html_path)
            continue
        out_name = metadata["itxtitle"].strip() + ".md"
        out_dir = os.path.dirname(str(html_path).replace(src_dir, dest_dir))
        out_path = file_helper.clean_file_path(os.path.join(out_dir, out_name))
        dump_markdown(src_file=html_path, dest_file=out_path)
def dump_all_texts(dest_dir, overwrite=False):
    """Scrape every text linked from the adishila.com index page into dest_dir.

    :param overwrite: when False, files that already exist are skipped.
    """
    soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    for anchor in soup.select("div.wp-block-group a"):
        # NOTE: the text is fetched before the existence check (as in the
        # original flow), so a skip still incurs the download.
        (title, text) = get_text(anchor["href"])
        out_name = file_helper.clean_file_path("%s.md" % title)
        out_path = os.path.join(dest_dir, out_name)
        if not overwrite and os.path.exists(out_path):
            logging.warning("Skipping %s since it exists", out_path)
            continue
        logging.info("Getting %s", anchor["href"])
        out_file = MdFile(file_path=out_path, frontmatter_type=MdFile.TOML)
        out_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
def get_filename(source_html):
    """Derive a clean markdown file name for a local html file.

    Prefers the page's <title> (transliterated IAST -> OPTITRANS); falls back
    to the source basename with its extension swapped to .md.
    """
    logging.info("Getting file name for %s", source_html)
    with codecs.open(source_html, "r", 'utf-8') as file_in:
        contents = file_in.read()
    soup = BeautifulSoup(contents, 'lxml')
    title = soup.title.string
    if title is None:
        # BUGFIX: the old `.replace("htm", "md")` mangled ".html" into ".mdl"
        # (and corrupted "htm" occurring anywhere in the name); swap only the
        # actual extension instead.
        base, _ = os.path.splitext(os.path.basename(source_html))
        return base + ".md"
    title = sanscript.transliterate(title, _from=sanscript.IAST, _to=sanscript.OPTITRANS)
    filename = "%s.md" % title.strip()
    return file_helper.clean_file_path(filename.strip())
def import_md_recursive(source_dir, file_extension, source_format=None, dry_run=False):
    """Recursively convert files with `file_extension` under source_dir to markdown via pandoc.

    :param source_format: pandoc source-format name; defaults to file_extension.
    :param dry_run: passed through to MdFile.import_with_pandoc.
    """
    from pathlib import Path
    source_paths = sorted(Path(source_dir).glob("**/*." + file_extension))
    if source_format is None:
        source_format = file_extension
    for source_path in source_paths:
        md_path = str(source_path).replace("." + file_extension, ".md")
        md_path = file_helper.clean_file_path(md_path)
        # Never clobber an existing conversion.
        if os.path.exists(md_path):
            logging.info("Skipping %s", md_path)
            continue
        logging.info("Processing %s to %s", source_path, md_path)
        md_file = MdFile(file_path=md_path, frontmatter_type=MdFile.TOML)
        # BUGFIX: source_format was hard-coded to "rtf" here, silently ignoring
        # the computed/passed-in value; use the variable instead.
        md_file.import_with_pandoc(source_file=source_path, source_format=source_format, dry_run=dry_run)
def scrape_post_markdown(url, dir_path, dry_run):
    """Scrape a single blog post and dump it as a date-prefixed markdown file.

    :param url: post url such as
        https://manasataramgini.wordpress.com/2020/06/08/some-slug/
    :param dir_path: directory under which the .md file is written.
    """
    # Build the file name from the url path:
    # /2020/06/08/some-slug/ -> 2020-06-08_some-slug.md
    file_name = urlsplit(url=url).path.strip()
    file_name = regex.sub("/(....)/(..)/(..)/(.+)/", r"\1-\2-\3_\4.md", file_name)
    file_path = file_helper.clean_file_path(file_path=os.path.join(dir_path, file_name))
    if os.path.exists(file_path):
        logging.warning("Skipping %s : exists", file_name)
        return
    (title, post_html) = get_post_html(url=url)
    logging.info("Dumping %s to %s with title %s.", url, file_path, title)
    post_file = MdFile(file_path=file_path, frontmatter_type=MdFile.TOML)
    post_file.import_content_with_pandoc(metadata={"title": title}, content=post_html, source_format="html", dry_run=dry_run)
def split_to_bits(self, source_script=sanscript.DEVANAGARI, indexed_title_pattern="%02d %s", target_frontmantter_type=TOML, dry_run=False):
    """Split this markdown file into one file per section, plus an _index.md remainder.

    Implementation notes: md parsers oft convert to html or json. Processing that
    output would be more complicated than what we need here.

    :param source_script: script of the section titles as stored on disk; when not
        None, titles are transliterated OPTITRANS<->source_script for display/file names.
    :param indexed_title_pattern: pattern used to prefix each section title with its
        1-based index; pass None to keep titles unprefixed.
    :param target_frontmantter_type: frontmatter flavor for the emitted MdFiles.
    :param dry_run: when True, nothing is written or removed.
    :return: None
    """
    # TODO: Fix links upon splitting.
    logging.debug("Processing file: %s", self.file_path)
    # Sections of an _index.md land next to it; otherwise a directory named
    # after the file (minus .md) is used as the output location.
    if os.path.basename(self.file_path) == "_index.md":
        out_dir = os.path.dirname(self.file_path)
    else:
        out_dir = os.path.join(os.path.dirname(self.file_path), os.path.basename(self.file_path).replace(".md", ""))
    (metadata, md) = self.read_md_file()
    lines = md.splitlines(keepends=False)
    # Everything before the first section heading becomes the remainder _index.md.
    (lines_till_section, remaining) = get_lines_till_section(lines)
    sections = split_to_sections(remaining)
    for section_index, (title, section_lines) in enumerate(sections):
        if indexed_title_pattern is not None:
            # Prefix with a 1-based section number, e.g. "01 <title>".
            title = indexed_title_pattern % (section_index + 1, title)
        if source_script is not None:
            # Title for the frontmatter, in the source script.
            title = sanscript.transliterate(title, sanscript.OPTITRANS, source_script)
        title_in_file_name = title
        if source_script is not None:
            # File names stay in OPTITRANS (ASCII-safe).
            title_in_file_name = sanscript.transliterate(title, source_script, sanscript.OPTITRANS)
        if title_in_file_name == "":
            raise ValueError(title_in_file_name)
        file_name = file_helper.clean_file_path("%s.md" % title_in_file_name)
        file_path = os.path.join(out_dir, file_name)
        section_yml = {"title": title}
        # Promote headings one level since each section becomes its own file.
        section_md = "\n".join(reduce_section_depth(section_lines))
        md_file = MdFile(file_path=file_path, frontmatter_type=target_frontmantter_type)
        md_file.dump_to_file(metadata=section_yml, md=section_md, dry_run=dry_run)
    remainder_file_path = os.path.join(out_dir, "_index.md")
    md = "\n".join(lines_till_section)
    logging.debug(metadata)
    # NOTE(review): "+" prefix on the index title appears to be a local
    # convention for index files - confirm against other callers.
    if not metadata["title"].startswith("+"):
        metadata["title"] = "+" + metadata["title"]
    MdFile(file_path=remainder_file_path, frontmatter_type=target_frontmantter_type).dump_to_file(metadata=metadata, md=md, dry_run=dry_run)
    # If the remainder replaced the original file path, keep it; otherwise the
    # original file has been fully split and is removed.
    if str(self.file_path) != str(remainder_file_path):
        logging.info("Removing %s as %s is different ", self.file_path, remainder_file_path)
        if not dry_run:
            os.remove(path=self.file_path)
def get_output_path(text_name, outdir):
    """Return the markdown output path for a devanAgarI text name under outdir."""
    optitrans_name = sanscript.transliterate(data=text_name, _from=sanscript.DEVANAGARI, _to=sanscript.OPTITRANS)
    file_name = file_helper.clean_file_path(optitrans_name) + ".md"
    return os.path.join(outdir, file_name)