Example #1
def get_text(text_id, base_dir):
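    """Download Mahabharata chapters (BORI or Kumbhakonam edition) from
    mahabharata.manipal.edu and save each chapter as a markdown file under base_dir."""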
    if text_id == "BORI":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                      "mahaabhaarata/bori.json")
    else:
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                      "mahaabhaarata/kumbhakonam.json")

    for book_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                 unit_path_list=[]):
        book_index = "%02d" % book_index
        chapter_list = text_data.get_subunit_list(json_file=unit_info_file,
                                                  unit_path_list=[book_index])
        book_data = text_data.get_subunit_data(json_file=unit_info_file,
                                               unit_path_list=[book_index])

        for chapter_index in chapter_list:
            infile_path = "http://mahabharata.manipal.edu/browse/%s/%s/%d.txt" % (
                book_data["alt_title"].lower(), text_id, chapter_index)
            outfile_path = os.path.join(base_dir, str(book_index),
                                        "%03d.md" % chapter_index)
            logging.info("Book %s chapter %d url: %s outpath: %s", book_index,
                         chapter_index, infile_path, outfile_path)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue

            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                resource = urllib.request.urlopen(infile_path)
                content = resource.read().decode("utf-8")
                outfile.writelines([content])
Example #2
def dump_text(base_dir, do_transliteration=False):
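    """Scrape the Shatapatha Brahmana from TITUS, optionally transliterate each
    sentence to Devanagari, and write one markdown file per adhyAya under base_dir."""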
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "shatapatha.json")

    titus_url = "http://titus.uni-frankfurt.de/texte/etcs/ind/aind/ved/yvw/sbm/sbm.htm"
    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        sarga_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
        for sarga_index in sarga_list:
            logging.info("kaanDa %d adhyaaya %d", kaanda_index, sarga_index)

            outfile_path = os.path.join(base_dir, "%02d" % kaanda_index, "%02d.md" % sarga_index)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue

            titus.navigate_to_part(base_page_url=titus_url, level_3_id=kaanda_index, level_4_id=sarga_index)
            sentences = titus.get_text()
            lines = ["\n"]
            for sentence in sentences:
                sentence = roman.RomanScheme.simplify_accent_notation(sentence)
                sentence = sentence.replace("/", ".")
                if not sentence.endswith("."):
                    sentence = sentence + ".."
                if do_transliteration:
                    if kaanda_index == 12:
                        sentence = sanscript.transliterate(sentence, sanscript.IAST, sanscript.DEVANAGARI)
                    else:
                        sentence = sanscript.transliterate(sentence, sanscript.TITUS, sanscript.DEVANAGARI)
                    sentence = roman.RomanScheme.to_shatapatha_svara(sentence)
                lines.append(sentence + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.writelines(lines)
Example #3
def dump_text(base_dir):
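  """Scrape the Shaunaka samhitA of the Atharvaveda from vedicheritage.gov.in with a
  (non-headless) Selenium Chrome driver and dump one markdown file per sUkta."""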
  opts = options.Options()
  opts.headless = False
  browser = webdriver.Chrome(options=opts)
  browser.implicitly_wait(6)
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/shaunaka/samhitA.json")

  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    subunit_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
    for subunit_index in subunit_list:
      logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)

      outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index), "%03d.md" % subunit_index)
      if os.path.exists(outfile_path):
        logging.info("Skipping " + outfile_path)
        continue

      url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
      kaanda_index, subunit_index)
      logging.info("url %s to %s", url, outfile_path)
      browser.get(url=url)
      text = browser.find_element_by_id("videotext").text
      text = text.replace("\n", "  \n")
      title_tags = browser.find_elements_by_css_selector("#videotext  strong")
      title = "%03d" % subunit_index
      if len(title_tags) > 0:
        title = "%03d %s" % (subunit_index, title_tags[0].text)
      title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
      md_file = MdFile(file_path=outfile_path)
      md_file.dump_to_file(metadata={"title": title}, content=text, dry_run=False)

  browser.close()
Example #4
def test_get_subunit_list():
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "shatapatha.json")
    assert text_data.get_subunit_list(json_file=unit_info_file,
                                      unit_path_list=[]) == range(1, 15)
    assert text_data.get_subunit_list(json_file=unit_info_file,
                                      unit_path_list=[2]) == range(1, 7)
def dump_text(base_dir):
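    # NOTE: this snippet relies on a module-level Selenium `browser` instance
    # (not shown here) to fetch and parse the vedicheritage.gov.in pages.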
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "veda/shaunaka/samhitA.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        subunit_list = text_data.get_subunit_list(
            json_file=unit_info_file, unit_path_list=[kaanda_index])
        for subunit_index in subunit_list:
            logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)

            outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index),
                                        "%03d.md" % subunit_index)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue

            url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
                kaanda_index, subunit_index)
            logging.info("url %s to %s", url, outfile_path)
            browser.get(url=url)
            text = browser.find_element_by_id("videotext").text
            text = text.replace("\n", "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                logging.debug(text)
                outfile.write(text)
Example #6
def get_text(text_id, base_dir):
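    """Fetch Mahabharata chapters (BORI, Kumbhakonam or vAvilla edition) as JSON from the
    Manipal readMaha2.php API and write the chapter lines to markdown files under base_dir."""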
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaaratam/kumbhakonam.json")
    if text_id == "BORI":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaaratam/bori.json")
    elif text_id == "KK":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaaratam/kumbhakonam.json")
    elif text_id == "SV":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaaratam/vAvilla.json")

    for book_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        book_index = "%02d" % book_index
        chapter_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[book_index])

        for chapter_index in chapter_list:
            infile_path = "http://mahabharata.manipal.edu/anu-projects/MAHE/apiphpv5/readMaha2.php?src=%s&parva=%s&adh=%03d" % (text_id, book_index, chapter_index)
            outfile_path = os.path.join(base_dir, str(book_index), "%03d.md" % chapter_index)
            if os.path.exists(outfile_path):
                logging.warning("Skipping " + outfile_path)
                continue
            logging.info("Book %s chapter %d url: %s outpath: %s", book_index, chapter_index, infile_path, outfile_path)

            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            resource = urllib.request.urlopen(infile_path)
            content = resource.read().decode("utf-8")
            chapter_lines = [line["text"] + "  \n" for line in json.loads(content)]
            if len(chapter_lines) > 0:
                with open(outfile_path, "w") as outfile:
                    outfile.writelines(chapter_lines)
            else:
                logging.error("No lines found for %s:%s-%03d", text_id, book_index, chapter_index)
def dump_text(base_dir):
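    # NOTE: `browser` is assumed to be a module-level Selenium WebDriver instance
    # initialised elsewhere; it is not defined in this snippet.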
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "vedaH/vAjasaneyi/samhitA.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        logging.info("adhyAya %d", kaanda_index)

        outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
            continue

        url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
            kaanda_index)
        logging.info("url %s to %s", url, outfile_path)
        browser.get(url=url)
        try:
            text = browser.find_element_by_id("videotext").text
            text = text.replace("\n", "  \n")
            title = "%02d" % kaanda_index
            title = sanscript.transliterate(title, sanscript.HK,
                                            sanscript.DEVANAGARI)
            md_file = MdFile(file_path=outfile_path)
            md_file.dump_to_file(metadata={"title": title},
                                 md=text,
                                 dry_run=False)
        except NoSuchElementException:
            logging.warning("Page missing! %s ", url)
def get_ramayana_text(browser, text_id, base_dir):
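    """Navigate the already-loaded site in `browser` by clicking the text, Kanda and Sarga
    links, and save each sarga of the selected Ramayana edition as a markdown file."""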
    browser.find_element_by_link_text(text_id).click()
    # browser.implicitly_wait(2)
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "raamaayana/andhra.json")
    if text_id == "रामायणम्-नव्यपाठः":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                      "raamaayana/baroda.json")
    else:
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                      "raamaayana/kumbhakonam.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        kaanda_element = browser.find_element_by_link_text("Kanda-%d" %
                                                           kaanda_index)
        # kaanda_element.click()
        # A headless browser sometimes fails here with selenium.common.exceptions.ElementClickInterceptedException
        # ("element click intercepted"), while a non-headless browser works fine. Alternatively, see
        # https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen
        browser.execute_script("arguments[0].click();", kaanda_element)
        sarga_list = text_data.get_subunit_list(json_file=unit_info_file,
                                                unit_path_list=[kaanda_index])

        for sarga_index in sarga_list:
            logging.info("Kanda %d Sarga %d", kaanda_index, sarga_index)
            outfile_path = os.path.join(base_dir, str(kaanda_index),
                                        "%03d" % sarga_index + ".md")
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            browser.find_element_by_link_text("Sarga-%d" % sarga_index).click()
            text_spans = browser.find_element_by_id(
                "divResults").find_elements_by_tag_name("span")
            lines = ["\n", "\n"]
            for span in text_spans:
                shloka = span.text
                shloka = shloka.replace("। ", "।  \n")
                shloka = shloka.replace("।।", " ॥ ")
                lines.append(shloka + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - otherwise the driver may pick a sarga from this kANDa
        # when it should pick one from the next kANDa.
        browser.find_element_by_link_text("Kanda-%d" % kaanda_index).click()
Example #9
def dump_text(base_dir, do_transliteration=False):
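    """Scrape the vAjasaneyi samhitA from TITUS and write one markdown file per
    chapter under base_dir; the do_transliteration flag is unused in this snippet."""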
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/vAjasaneyi/samhitA.json")

    titus_url = "http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvw/vs/vs.htm"
    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        logging.info("kaanDa %d", kaanda_index)

        outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
            continue

        titus.navigate_to_part(base_page_url=titus_url, level_3_id=kaanda_index, level_3_frame="etaindexb")
        sentences = titus.dump_text()
        os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
        with open(outfile_path, "w") as outfile:
            outfile.write("  \n".join(sentences))