Example #1
0
def get_file_path(out_dir, title_iast, author_iast="", catalog_number=""):
    """Build the markdown output path for a work from its IAST title and author.

    The title and author are transliterated from IAST to Optitrans and combined
    with the stripped catalog number as ``<title>_<author>_<catalog>.md``. That
    name is passed through ``file_helper.clean_file_path`` and joined onto
    ``out_dir``.

    :param out_dir: Directory the cleaned file name is joined onto.
    :param title_iast: Title of the work, in IAST.
    :param author_iast: Author of the work, in IAST. Defaults to "".
    :param catalog_number: Catalog identifier; surrounding whitespace is stripped.
    :return: ``out_dir`` joined with the cleaned file name.
    """
    name_parts = [
        sanscript.transliterate(data=iast_text, _from=sanscript.IAST, _to=sanscript.OPTITRANS)
        for iast_text in (title_iast, author_iast)
    ]
    name_parts.append(catalog_number.strip())
    raw_name = "%s_%s_%s.md" % tuple(name_parts)
    cleaned_name = file_helper.clean_file_path(file_path=raw_name)
    return os.path.join(out_dir, cleaned_name)
Example #2
0
 def set_filename_from_title(self, transliteration_source, dry_run):
     """Rename this file so that its name is derived from its title.

     Files whose path ends with ``_index.md`` are treated as special and are
     left alone. For every other file the title (including the chapter id) is
     optionally transliterated to Optitrans, whitespace/underscore/dot runs
     are collapsed to a single ``_``, dash runs are collapsed to a single
     ``-``, ``.md`` is appended and the result is passed through
     ``file_helper.clean_file_path``.

     :param transliteration_source: Script to transliterate the title from,
       or None to use the title as is.
     :param dry_run: When true, the rename is only logged, not performed.
     :return: None. ``self.file_path`` is updated to the new path, even in a
       dry run.
     """
     if str(self.file_path).endswith("_index.md"):
         logging.info("Special file %s. Skipping.", self.file_path)
         return
     title = self.get_title(omit_chapter_id=False)
     if transliteration_source is not None:
         title = sanscript.transliterate(data=title,
                                         _from=transliteration_source,
                                         _to=sanscript.OPTITRANS)
     # The early return above already covers every path whose base name is
     # "_index.md", so only the regular "<name>.md" case remains here.
     current_path = self.file_path
     file_name = regex.sub("[ _.]+", "_", title.strip())
     file_name = regex.sub("-+", "-", file_name)
     file_name = file_helper.clean_file_path(file_name + ".md")
     file_path = os.path.join(os.path.dirname(current_path), file_name)
     if str(current_path) != file_path:
         logging.info("Renaming %s to %s", current_path, file_path)
         if not dry_run:
             os.rename(src=current_path, dst=file_path)
     self.file_path = file_path
Example #3
0
def markdownify_local_htmls(src_dir, dest_dir, dumper, dry_run=False):
    """Convert every local ``*.htm*`` file under ``src_dir`` to markdown.

    The directory layout below ``src_dir`` is mirrored below ``dest_dir`` and
    each file's extension is replaced by ``.md``. The destination is derived
    from the path relative to ``src_dir`` rather than by substring
    replacement, so directory names containing ``.htm`` are left intact and
    spellings such as ``./src`` or a trailing slash still map correctly.

    :param src_dir: Root directory searched recursively for ``*.htm*`` files.
    :param dest_dir: Root directory for the generated markdown files.
    :param dumper: Callable invoked once per file with the keyword arguments
      ``url``, ``outfile_path``, ``index`` and ``dry_run``.
    :param dry_run: Forwarded unchanged to ``dumper``.
    :return: None.
    """
    src_root = Path(src_dir)
    dest_root = Path(dest_dir)
    for index, src_path in enumerate(sorted(src_root.glob("**/*.htm*"))):
        # Paths yielded by glob always start with src_root, so relative_to is safe.
        relative_md = src_path.relative_to(src_root).with_suffix(".md")
        dest_path = file_helper.clean_file_path(str(dest_root / relative_md))
        dumper(url="file://" + str(src_path),
               outfile_path=dest_path,
               index=index,
               dry_run=dry_run)
def markdownify_all(src_dir, dest_dir):
    """Convert every ``doc_*/*.html`` file under ``src_dir`` to markdown.

    Each output file is named after the stripped ``itxtitle`` metadata value
    and is placed in the directory under ``dest_dir`` that mirrors the source
    file's directory under ``src_dir``. Files for which ``get_metadata``
    returns nothing are skipped with a warning.

    The mirrored directory is computed from the path relative to ``src_dir``
    instead of by substring replacement, so it does not depend on how
    ``src_dir`` was spelled (``./src``, trailing slash, ...).

    :param src_dir: Root directory searched recursively for source files.
    :param dest_dir: Root directory for the generated markdown files.
    :return: None.
    """
    src_root = Path(src_dir)
    dest_root = Path(dest_dir)
    for src_path in sorted(src_root.glob("**/doc_*/*.html")):
        metadata = get_metadata(src_file=src_path)
        # A falsy check also covers None, which "== {}" let through to a crash.
        if not metadata:
            logging.warning("No metadata found for %s", src_path)
            continue
        filename = metadata["itxtitle"].strip() + ".md"
        dest_parent = dest_root / src_path.relative_to(src_root).parent
        dest_path = file_helper.clean_file_path(os.path.join(str(dest_parent), filename))
        dump_markdown(src_file=src_path, dest_file=dest_path)
def dump_all_texts(dest_dir, overwrite=False):
    """Download every text linked from the adishila index page into ``dest_dir``.

    Each link selected by ``div.wp-block-group a`` is fetched with
    ``get_text`` and written as ``<title>.md`` (file name cleaned with
    ``file_helper.clean_file_path``) with TOML frontmatter holding the title.

    :param dest_dir: Directory the markdown files are written to.
    :param overwrite: When false, destinations that already exist are skipped.
    :return: None.
    """
    index_soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    for anchor in index_soup.select("div.wp-block-group a"):
        href = anchor["href"]
        (title, text) = get_text(href)
        dest_path = os.path.join(dest_dir, file_helper.clean_file_path("%s.md" % title))
        if overwrite or not os.path.exists(dest_path):
            logging.info("Getting %s", href)
            target = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
            target.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
        else:
            logging.warning("Skipping %s since it exists", dest_path)
Example #6
0
def get_filename(source_html):
    """Derive a markdown file name for an HTML file from its ``<title>``.

    The title is transliterated from IAST to Optitrans, stripped, suffixed
    with ``.md`` and passed through ``file_helper.clean_file_path``.

    When the document has no usable title (no ``<title>`` element, a title
    without a single string, or a blank one), the source file's own base name
    is used with its extension replaced by ``.md``.

    :param source_html: Path of the UTF-8 encoded HTML file to inspect.
    :return: The markdown file name (without any directory part).
    """
    logging.info("Getting file name for %s", source_html)
    with open(source_html, "r", encoding="utf-8") as file_in:
        soup = BeautifulSoup(file_in.read(), 'lxml')
    # soup.title is None when the document has no <title> element at all.
    title = soup.title.string if soup.title is not None else None
    if title is None or not title.strip():
        # Replace only the real extension: a substring replace of "htm"
        # turned "foo.html" into "foo.mdl".
        stem, _extension = os.path.splitext(os.path.basename(source_html))
        return stem + ".md"
    title = sanscript.transliterate(title,
                                    _from=sanscript.IAST,
                                    _to=sanscript.OPTITRANS)
    filename = "%s.md" % title.strip()
    return file_helper.clean_file_path(filename.strip())
Example #7
0
def import_md_recursive(source_dir, file_extension, source_format=None, dry_run=False):
  """Convert every ``*.<file_extension>`` file under ``source_dir`` to markdown with pandoc.

  Each markdown file is written next to its source, with the extension
  replaced by ``.md`` and the path passed through
  ``file_helper.clean_file_path``. Sources whose markdown file already exists
  are skipped.

  :param source_dir: Root directory searched recursively.
  :param file_extension: Extension (without the leading dot) of the files to convert.
  :param source_format: Pandoc input format; defaults to ``file_extension``.
  :param dry_run: Forwarded to ``MdFile.import_with_pandoc``.
  :return: None.
  """
  from pathlib import Path
  suffix = "." + file_extension
  if source_format is None:
    source_format = file_extension
  for source_path in sorted(Path(source_dir).glob("**/*" + suffix)):
    source_str = str(source_path)
    if source_str.endswith(suffix):
      # Swap only the trailing extension, not every occurrence in the path.
      md_path = source_str[:-len(suffix)] + ".md"
    else:
      # Presumably only reachable when the match differs in case or the
      # extension contains glob wildcards; keep the previous behaviour then.
      md_path = source_str.replace(suffix, ".md")
    md_path = file_helper.clean_file_path(md_path)
    if os.path.exists(md_path):
      logging.info("Skipping %s", md_path)
      continue
    logging.info("Processing %s to %s", source_path, md_path)
    md_file = MdFile(file_path=md_path, frontmatter_type=MdFile.TOML)
    # Pass the resolved format; it was previously hard-coded to "rtf",
    # silently ignoring both source_format and file_extension.
    md_file.import_with_pandoc(source_file=source_path, source_format=source_format, dry_run=dry_run)
Example #8
0
def scrape_post_markdown(url, dir_path, dry_run):
    """Scrape one blog post and store it as a markdown file under ``dir_path``.

    The file name is built from the URL path: a path such as
    ``/2020/06/08/pandemic-days-the-fizz-is-out-of-the-bottle/`` becomes
    ``2020-06-08_pandemic-days-the-fizz-is-out-of-the-bottle.md``. The
    trailing slash is optional. Nothing is fetched when the target file
    already exists.

    :param url: URL of the post.
    :param dir_path: Directory the markdown file is written to.
    :param dry_run: Forwarded to ``MdFile.import_content_with_pandoc``.
    :return: None.
    """
    # Construct file_name from the post's url.
    parsed_url = urlsplit(url=url)
    file_name = parsed_url.path.strip()
    # Remove slashes, replace with dashes when dealing with urls like
    # https://manasataramgini.wordpress.com/2020/06/08/pandemic-days-the-fizz-is-out-of-the-bottle/
    # The final slash is optional: without a match the name would stay an
    # absolute path, making os.path.join below discard dir_path.
    file_name = regex.sub(r"/(....)/(..)/(..)/(.+?)/?$", r"\1-\2-\3_\4.md", file_name)
    file_path = file_helper.clean_file_path(file_path=os.path.join(dir_path, file_name))

    if os.path.exists(file_path):
        logging.warning("Skipping %s : exists", file_name)
        return
    (title, post_html) = get_post_html(url=url)
    logging.info("Dumping %s to %s with title %s.", url, file_path, title)

    md_file = MdFile(file_path=file_path, frontmatter_type=MdFile.TOML)
    md_file.import_content_with_pandoc(metadata={"title": title}, content=post_html, source_format="html", dry_run=dry_run)
Example #9
0
  def split_to_bits(self, source_script=sanscript.DEVANAGARI, indexed_title_pattern="%02d %s",
                    target_frontmantter_type=TOML, dry_run=False):
    """Split this markdown file into one file per section plus an ``_index.md`` remainder.

    Every section produced by ``split_to_sections`` is dumped to its own file
    in an output directory. That directory is the file's own directory when
    the file is already named ``_index.md``; otherwise it is a sibling
    directory named after the file with ``.md`` removed. The lines before the
    first section, together with the file's metadata (title prefixed with
    ``+``), are dumped to ``_index.md`` in that directory. The original file
    is then removed if its path differs from that ``_index.md``.

    Implementation notes: md parsers often convert to html or json. Processing that output would be more complicated than what we need here.

    :param source_script: Script of the section titles. When not None, file
      names are obtained by transliterating the title from this script to
      Optitrans, and indexed titles are transliterated from Optitrans to it.
    :param indexed_title_pattern: %-format pattern applied to
      ``(1-based section index, title)``; None leaves titles unchanged.
    :param target_frontmantter_type: Frontmatter type passed to ``MdFile`` for
      every file that is written.
    :param dry_run: Forwarded to ``dump_to_file``; when true the original file
      is also not removed.
    :raises ValueError: If the title used for a section's file name is empty.
    :return: None.
    """
    # TODO: Fix links upon splitting.
    logging.debug("Processing file: %s", self.file_path)
    # Choose the directory that receives the section files and _index.md.
    if os.path.basename(self.file_path) == "_index.md":
      out_dir = os.path.dirname(self.file_path)
    else:
      out_dir = os.path.join(os.path.dirname(self.file_path), os.path.basename(self.file_path).replace(".md", ""))
    (metadata, md) = self.read_md_file()
    lines = md.splitlines(keepends=False)
    # Content preceding the first section stays with the index file.
    (lines_till_section, remaining) = get_lines_till_section(lines)
    sections = split_to_sections(remaining)
    for section_index, (title, section_lines) in enumerate(sections):
      if indexed_title_pattern is not None:
        # Prefix the title with its 1-based position.
        title = indexed_title_pattern % (section_index + 1, title)
        if source_script is not None:
          # NOTE(review): presumably this renders the inserted index digits in
          # source_script - confirm against sanscript's handling of mixed input.
          title = sanscript.transliterate(title, sanscript.OPTITRANS, source_script)
      title_in_file_name = title
      if source_script is not None:
        # File names use the Optitrans form of the title.
        title_in_file_name = sanscript.transliterate(title, source_script, sanscript.OPTITRANS)
      if title_in_file_name == "":
        raise ValueError(title_in_file_name)
      file_name = file_helper.clean_file_path("%s.md" % title_in_file_name)
      file_path = os.path.join(out_dir, file_name)
      section_yml = {"title": title}
      section_md = "\n".join(reduce_section_depth(section_lines))
      md_file = MdFile(file_path=file_path, frontmatter_type=target_frontmantter_type)
      md_file.dump_to_file(metadata=section_yml, md=section_md, dry_run=dry_run)

    # Write the pre-section content and the original metadata to _index.md.
    remainder_file_path = os.path.join(out_dir, "_index.md")
    md = "\n".join(lines_till_section)
    logging.debug(metadata)
    if not metadata["title"].startswith("+"):
      metadata["title"] = "+" + metadata["title"]
    MdFile(file_path=remainder_file_path, frontmatter_type=target_frontmantter_type).dump_to_file(metadata=metadata,
                                                                                                  md=md,
                                                                                                  dry_run=dry_run)
    # The original file is superseded by the index file unless they are the same path.
    if str(self.file_path) != str(remainder_file_path):
      logging.info("Removing %s as %s is different ", self.file_path, remainder_file_path)
      if not dry_run:
        os.remove(path=self.file_path)
def get_output_path(text_name, outdir):
    """Return the markdown output path for a text named in Devanagari.

    The name is transliterated from Devanagari to Optitrans and passed through
    ``file_helper.clean_file_path``; ``.md`` is then appended and the result
    is joined onto ``outdir``.

    :param text_name: Name of the text, in Devanagari.
    :param outdir: Directory the file name is joined onto.
    :return: ``outdir`` joined with ``<cleaned optitrans name>.md``.
    """
    optitrans_name = sanscript.transliterate(
        data=text_name,
        _from=sanscript.DEVANAGARI,
        _to=sanscript.OPTITRANS,
    )
    cleaned_name = file_helper.clean_file_path(optitrans_name)
    return os.path.join(outdir, cleaned_name + ".md")