Esempio n. 1
0
def dump_text_serially(start_url, out_path, dry_run=False):
    """Dump a series of pages, beginning at ``start_url``, under ``out_path``.

    The work is delegated to ``souper.dump_series``, which is handed the
    page-level dumper and the next-URL getter defined below.

    Args:
        start_url: URL of the first page, forwarded to ``souper.dump_series``.
        out_path: Output location, forwarded to ``souper.dump_series``.
        dry_run: Forwarded unchanged to ``souper.dump_series``.
    """

    def next_url_getter(soup):
        # Links are looked up with the "div.order-3 a" selector and resolved
        # against the wisdomlib.org base URL.
        return souper.next_url_from_soup_css(
            soup=soup,
            css="div.order-3 a",
            base_url="https://www.wisdomlib.org/")

    def html_fixer(soup):
        """Intentionally does nothing to the parsed page."""
        return None

    def title_maker(soup, title_prefix):
        # The title is read from the page's "h1" element.
        return souper.title_from_element(
            soup=soup,
            title_css_selector="h1",
            title_prefix=title_prefix)

    def dumper(url, outfile_path, title_prefix, dry_run):
        # Text is taken from the element matching "#scontent".
        return souper.dump_text_from_element(
            url=url,
            outfile_path=outfile_path,
            text_css_selector="#scontent",
            title_maker=title_maker,
            title_prefix=title_prefix,
            html_fixer=html_fixer,
            dry_run=dry_run)

    souper.dump_series(
        start_url=start_url,
        out_path=out_path,
        dumper=dumper,
        next_url_getter=next_url_getter,
        dry_run=dry_run)
Esempio n. 2
0
def dump_text(start_url, out_path, dry_run=False):
    """Dump a series of sa.wikisource.org pages, starting at ``start_url``.

    The work is delegated to ``souper.dump_series`` using the helpers
    defined below.

    Args:
        start_url: URL of the first page, forwarded to ``souper.dump_series``.
        out_path: Output location, forwarded to ``souper.dump_series``.
        dry_run: Forwarded unchanged to ``souper.dump_series``.
    """
    # (css_selector, tag_name) pairs handed to souper.tag_replacer, in order.
    tag_replacements = (
        ("big", "h2"),
        ("table", "div"),
        ("tbody", "div"),
        ("span[style*=\"font-weight:bold;\"]", "b"),
    )

    def next_url_getter(soup):
        return souper.next_url_from_soup_css(
            soup=soup,
            css="div.gen_header_forelink a",
            base_url="https://sa.wikisource.org/")

    def html_fixer(soup):
        """Apply the tag replacements, then remove ".noprint" elements."""
        for css_selector, tag_name in tag_replacements:
            souper.tag_replacer(soup=soup,
                                css_selector=css_selector,
                                tag_name=tag_name)
        souper.tag_remover(soup=soup, css_selector=".noprint")

    def title_maker(soup, title_prefix):
        raw_title = souper.title_from_element(
            soup=soup,
            title_css_selector="h1",
            title_prefix=title_prefix)
        # Delete everything from the first space up to the last "/".
        return regex.sub(" .+/", "", raw_title)

    def dumper(url, outfile_path, title_prefix, dry_run):
        return souper.dump_text_from_element(
            url=url,
            outfile_path=outfile_path,
            text_css_selector="div.mw-parser-output",
            title_maker=title_maker,
            title_prefix=title_prefix,
            html_fixer=html_fixer,
            dry_run=dry_run)

    souper.dump_series(
        start_url=start_url,
        out_path=out_path,
        dumper=dumper,
        next_url_getter=next_url_getter,
        dry_run=dry_run)
Esempio n. 3
0
def dump_item(item_url, outfile_path, title_maker, dry_run=False):
    """Dump the text of a single page to ``outfile_path``.

    The page's "div.content" element is passed to
    ``souper.dump_text_from_element`` together with the HTML and markdown
    fixers defined below.

    Args:
        item_url: URL of the page to dump; it is also logged at INFO level.
        outfile_path: Output location, forwarded to the dumper.
        title_maker: Callable forwarded to the dumper to produce the title.
        dry_run: Forwarded to the dumper. Defaults to ``False``, which was
            the previously hard-coded value.
    """
    logging.info(item_url)

    def html_fixer(soup):
        souper.tag_replacer(soup=soup, css_selector="table", tag_name="div")
        souper.tag_remover(soup=soup, css_selector="div.view-filters")

    def md_fixer(md):
        # Two consecutive dandas (U+0964) become one double danda (U+0965)
        # padded with spaces. The earlier literals were the UTF-8 bytes of
        # these characters mis-decoded as Cyrillic, so they never matched.
        # Escapes are used so the literals survive future encoding mishaps.
        md = md.replace("\u0964\u0964", " \u0965 ")
        # Full stops are turned into dashes before transliteration.
        md = md.replace(".", " - ")
        md = sanscript.transliterate(md, sanscript.IAST, sanscript.DEVANAGARI)
        return md

    souper.dump_text_from_element(url=item_url,
                                  outfile_path=outfile_path,
                                  text_css_selector="div.content",
                                  title_maker=title_maker,
                                  title_prefix="",
                                  html_fixer=html_fixer,
                                  md_fixer=md_fixer,
                                  dry_run=dry_run)
Esempio n. 4
0
def dump_text(start_url,
              out_path,
              base_url="https://sa.wikisource.org/",
              next_url_css="div.gen_header_forelink a",
              transliteration_source=None,
              dry_run=False):
    """Dump a series of pages, then fix index files and file names.

    Three steps run in order: ``souper.dump_series``,
    ``library.fix_index_files`` and ``library.set_filenames_from_titles``,
    all operating on ``out_path``.

    Args:
        start_url: URL of the first page, forwarded to ``souper.dump_series``.
        out_path: Output directory used by all three steps.
        base_url: Base URL passed to ``souper.next_url_from_soup_css``.
        next_url_css: CSS selector passed to
            ``souper.next_url_from_soup_css``.
        transliteration_source: Passed to both post-processing steps.
        dry_run: Forwarded unchanged to all three steps.
    """

    def next_url_getter(soup):
        return souper.next_url_from_soup_css(
            soup=soup,
            css=next_url_css,
            base_url=base_url)

    def title_maker(soup, title_prefix):
        raw_title = souper.title_from_element(
            soup=soup,
            title_css_selector="h1",
            title_prefix=title_prefix)
        # Collapse everything from the first space up to the last "/" into a
        # single space, then trim surrounding whitespace.
        return regex.sub(" .+/", " ", raw_title).strip()

    def dumper(url, outfile_path, title_prefix, dry_run):
        # html_fixer is not defined in this function; it is looked up in the
        # enclosing (module) scope at call time.
        return souper.dump_text_from_element(
            url=url,
            outfile_path=outfile_path,
            text_css_selector="div.mw-parser-output",
            title_maker=title_maker,
            title_prefix=title_prefix,
            html_fixer=html_fixer,
            dry_run=dry_run)

    souper.dump_series(
        start_url=start_url,
        out_path=out_path,
        dumper=dumper,
        next_url_getter=next_url_getter,
        dry_run=dry_run)

    # NOTE(review): the *source* script is passed as the transliteration
    # *target* here - presumably intentional; confirm against the library.
    library.fix_index_files(
        dir_path=out_path,
        dry_run=dry_run,
        transliteration_target=transliteration_source,
        overwrite=True)
    library.set_filenames_from_titles(
        dir_path=out_path,
        dry_run=dry_run,
        transliteration_source=transliteration_source)
                        css_selector="font[size*=\"+1\"]",
                        tag_name="h4")


def title_maker(soup, title_prefix, index):
    """Build a title from the page's "h1", or from "h2" if no "h1" matches.

    Args:
        soup: Parsed page; must support ``select``.
        title_prefix: Forwarded to ``souper.title_from_element``.
        index: Forwarded to ``souper.title_from_element``.

    Returns:
        Whatever ``souper.title_from_element`` returns for the chosen
        selector.
    """
    # Choose the heading selector first so the helper is called in one place.
    heading_css = "h1" if len(soup.select("h1")) > 0 else "h2"
    return souper.title_from_element(
        soup=soup,
        title_css_selector=heading_css,
        title_prefix=title_prefix,
        index=index)


def dumper(url, outfile_path, dry_run, index):
    """Dump the "body" element of one page via ``souper``.

    All arguments are forwarded unchanged to
    ``souper.dump_text_from_element``, along with the module-level
    ``title_maker`` and ``html_fixer``.
    """
    return souper.dump_text_from_element(
        url=url,
        outfile_path=outfile_path,
        text_css_selector="body",
        title_maker=title_maker,
        html_fixer=html_fixer,
        dry_run=dry_run,
        index=index)


# Runs at import time: hands the HTML files under the hard-coded source
# directory to souper.markdownify_local_htmls together with the dumper above.
souper.markdownify_local_htmls(
    src_dir="/home/vvasuki/sanskrit/raw_etexts_english/local/pritchett/00islamlinks",
    dest_dir="",
    dumper=dumper)