Code Example #1
def dump_doc(url, out_dir, index, dry_run=False):
    out_file_path = regex.sub("/([^/]+).htm", "/%03d_\\1.md" % index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    full_url = "http://www.ramakrishnavivekananda.info/vivekananda/" + url
    soup = scraping.get_soup(full_url)
    metadata = {}
    title_elements = soup.select("h2")
    if len(title_elements) > 0:
        metadata["title"] = title_elements[0].text
    else:
        metadata["title"] = regex.sub("/([^/]+).htm", "\\1",
                                      url).replace("_", " ")
    body_element = soup.select("body")
    if len(body_element) == 0:
        logging.warning("Could not get text form %s with soup", full_url)
        filehandle = urllib.request.urlopen(full_url)
        content = filehandle.read().decode("utf8")
        filehandle.close()
    else:
        content = body_element[0].decode_contents()
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content,
                                       source_format="html",
                                       dry_run=dry_run,
                                       metadata=metadata)
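
Every example in this listing relies on the project's scraping.get_soup helper, whose definition is not included in these excerpts. A minimal sketch of what such a helper typically looks like, assuming it simply fetches the page and parses it with BeautifulSoup (the real implementation may handle headers, retries, and encodings differently):

import requests
from bs4 import BeautifulSoup


def get_soup(url):
    # Hypothetical sketch of scraping.get_soup: fetch the page and parse its HTML.
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")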
Code Example #2
def get_text(url):
    logging.info("Processing %s", url)
    soup = scraping.get_soup(url=url)
    content = soup.select("pre")[0].text
    content = sanscript.transliterate(data=content,
                                      _from=sanscript.IAST,
                                      _to=sanscript.DEVANAGARI)
    return content
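
The transliteration step presumably uses the sanscript module of the indic_transliteration package, whose transliterate function takes the source and target schemes as _from and _to. A standalone illustration under that assumption:

from indic_transliteration import sanscript

# IAST "rāmaḥ" becomes Devanagari "रामः".
print(sanscript.transliterate(data="rāmaḥ", _from=sanscript.IAST, _to=sanscript.DEVANAGARI))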
Code Example #3
def dump_docs(out_dir, dry_run=False):
    index_url = "https://www.ramakrishnavivekananda.info/vivekananda/master_index.htm"
    soup = scraping.get_soup(index_url)
    links = soup.select("a")
    for index, link in enumerate(links[3:]):
        dump_doc(url=link["href"],
                 out_dir=out_dir,
                 index=index,
                 dry_run=dry_run)
Code Example #4
def get_docs(out_dir):
    soup = scraping.get_soup(
        "https://etexts.muktabodha.org/DL_CATALOG_USER_INTERFACE/dl_user_interface_list_catalog_records.php?sort_key=title"
    )
    links = soup.select("a")
    for link in links:
        url = "https://etexts.muktabodha.org/DL_CATALOG_USER_INTERFACE/%s" % link[
            "href"]
        process_catalog_page_selenium(url=url, out_dir=out_dir)
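
process_catalog_page_selenium is not shown in these excerpts. Code Example #10 below notes that plain soup scraping of these catalog pages only returns template content, which is presumably why a browser-rendered page is used instead. A minimal, assumed sketch of fetching a rendered page with Selenium (the actual helper may work quite differently):

from bs4 import BeautifulSoup
from selenium import webdriver


def get_rendered_soup(url):
    # Hypothetical helper: let the browser execute the page's JavaScript,
    # then parse the rendered HTML.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()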
Code Example #5
File: souper.py Project: anirudh2290/doc_curation
def get_html(url):
    soup = scraping.get_soup(url)
    body_element = soup.select("body")
    if len(body_element) == 0:
        logging.warning("Could not get text form %s with soup", url)
        filehandle = urllib.request.urlopen(url)
        content = filehandle.read().decode("utf8")
        filehandle.close()
    else:
        content = body_element[0].decode_contents()
    return content
Code Example #6
def dump_all_texts(dest_dir, overwrite=False):
    soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    links = soup.select("div.wp-block-group a")
    for link in links:
        (title, text) = get_text(link["href"])
        filename = file_helper.clean_file_path("%s.md" % title)
        dest_path = os.path.join(dest_dir, filename)
        if not overwrite and os.path.exists(dest_path):
            logging.warning("Skipping %s since it exists", dest_path)
            continue
        logging.info("Getting %s", link["href"])
        md_file = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
        md_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
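
The get_text called here appears to be the variant shown in Code Example #9 below, which returns a (title, text) pair. A hypothetical invocation, with a placeholder destination directory:

# dest_dir is a placeholder; existing files are kept unless overwrite=True.
dump_all_texts(dest_dir="/tmp/adishila", overwrite=False)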
Code Example #7
def dump_doc(url, out_dir, index, dry_run=False):
    out_file_path = regex.sub(".+/([^/]+).html?", "%02d_\\1.md" %index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    soup = scraping.get_soup(url)
    if "Not Found" in soup.text:
        logging.warning("%s not found!", url)
        return
    metadata = get_metadata(soup=soup, index=index, url=url)
    dump_content(soup=soup, out_file_path=out_file_path, metadata=metadata, dry_run=dry_run)
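
get_metadata and dump_content are project helpers that do not appear in these excerpts. Purely as an assumption about their behaviour, modelled on the patterns in Code Example #1, they might look roughly like this:

def get_metadata(soup, index, url):
    # Assumed behaviour: take the page title, falling back to the file name in the URL.
    # index is available for naming or ordering; unused in this sketch.
    title_element = soup.select_one("h1") or soup.select_one("title")
    if title_element is not None:
        title = title_element.text.strip()
    else:
        title = regex.sub(".+/([^/]+).html?", "\\1", url).replace("_", " ")
    return {"title": title}


def dump_content(soup, out_file_path, metadata, dry_run):
    # Assumed behaviour: convert the body HTML to markdown via the project's MdFile helper.
    content = soup.select_one("body").decode_contents()
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content, source_format="html",
                                       dry_run=dry_run, metadata=metadata)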
Code Example #8
def dump_docs(index_url, out_dir, dry_run=False):
    soup = scraping.get_soup(index_url)
    out_file_path = os.path.join(out_dir, "_index.md")
    if not os.path.exists(out_file_path):
        dump_content(soup=soup, out_file_path=out_file_path, metadata={}, dry_run=dry_run)

    links = soup.select("a")
    for index, link in enumerate(links):
        href = link.get("href", None)
        text = fix_text(link.text)
        if href and "Back to" not in text and href not in ["http://voiceofdharma.org"]:
            if href.startswith("http"):
                url = link["href"]
            else:
                url = index_url + link["href"]
            dump_doc(url=url, out_dir=out_dir, index=index, dry_run=dry_run)
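
Note that the else branch above builds the target URL by plain string concatenation, which only works when index_url ends in a directory path (trailing slash). A more general alternative, not what the snippet itself does, is urllib.parse.urljoin:

from urllib.parse import urljoin

# Resolves a relative href against the index page regardless of trailing slashes.
url = urljoin(index_url, link["href"])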
Code Example #9
def get_text(url):
    soup = scraping.get_soup(url=url)
    text = soup.select_one("div.entry-content").text
    text = md_helper.markdownify_plain_text(text)
    title = regex.sub("[ -]*आदिशिला", "", soup.title.string).strip()
    return (title, text)
Code Example #10
def process_catalog_page_soup(url):
    """Does not work - get template content which is different from actual view in browser."""
    soup = scraping.get_soup(url=url)