Python transliterate Exemples, indic_transliteration.sanscript.transliterate Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : sb.py Projet : sanskrit-code/doc_curation

def dump_text(base_dir, do_transliteration=False):
    """Dump the text served by TITUS into one markdown file per adhyaaya.

    The (kaanda, sarga) index pairs are read from ``shatapatha.json`` next to
    the ``text_data`` module. Each part is written to
    ``<base_dir>/<kaanda:02d>/<sarga:02d>.md``; parts whose output file
    already exists are skipped.

    Args:
        base_dir: Root directory under which the markdown files are created.
        do_transliteration: When true, every sentence is transliterated to
            Devanagari (kaanda 12 is read as IAST, all others as TITUS) and
            then passed through ``to_shatapatha_svara``.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "shatapatha.json")

    titus_url = "http://titus.uni-frankfurt.de/texte/etcs/ind/aind/ved/yvw/sbm/sbm.htm"
    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        sarga_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
        for sarga_index in sarga_list:
            logging.info("kaanDa %d adhyaaya %d", kaanda_index, sarga_index)

            outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index), "%02d" % sarga_index + ".md")
            if os.path.exists(outfile_path):
                logging.info("Skipping %s", outfile_path)
                continue

            titus.navigate_to_part(base_page_url=titus_url, level_3_id=kaanda_index, level_4_id=sarga_index)
            sentences = titus.get_text()
            lines = ["\n"]
            # Kaanda 12 is handled as IAST; every other kaanda as TITUS.
            source_scheme = sanscript.IAST if kaanda_index == 12 else sanscript.TITUS
            for sentence in sentences:
                sentence = roman.RomanScheme.simplify_accent_notation(sentence)
                sentence = sentence.replace("/", ".")
                if not sentence.endswith("."):
                    sentence = sentence + ".."
                if do_transliteration:
                    sentence = sanscript.transliterate(sentence, source_scheme, sanscript.DEVANAGARI)
                    sentence = roman.RomanScheme.to_shatapatha_svara(sentence)
                # Two trailing spaces force a markdown line break.
                lines.append(sentence + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            # Explicit UTF-8: the text may be Devanagari and the platform
            # default encoding is not guaranteed to be able to encode it.
            with open(outfile_path, "w", encoding="utf-8") as outfile:
                outfile.writelines(lines)

Exemple #2

0

Afficher le fichier

Fichier : basic_test.py Projet : zhongys/indic_transliteration

def test_devanaagarii_equivalence():
    """Check that synonymous ITRANS spellings give the same Devanagari text."""
    primary_spelling = "rAmo gUDhaM vaktI~Ngitaj~naH kShetre"
    alternate_spelling = "raamo guuDhaM vaktii~NgitaGYaH xetre"

    def to_devanagari(itrans_text):
        return sanscript.transliterate(itrans_text, sanscript.ITRANS, sanscript.DEVANAGARI)

    logging.info(to_devanagari(primary_spelling))
    assert to_devanagari(primary_spelling) == to_devanagari(alternate_spelling)

Exemple #3

0

Afficher le fichier

Fichier : structures.py Projet : sanskrit-coders/dict-curation

	def get_headers(self):
		"""Return the headers in Devanagari followed by their ITRANS forms.

		The list holds the gana-varga name, the nighantu name joined with
		this entry's index, and then the ITRANS transliteration of each.
		"""
		indexed_title = NIGHANTU_NAAMA + " - _" + self.index
		devanagari_headers = [self.gana_varga_naama, indexed_title]
		itrans_headers = [
			transliterate(header, DEVANAGARI, ITRANS)
			for header in devanagari_headers
		]
		return devanagari_headers + itrans_headers

Exemple #4

0

Afficher le fichier

def get_file_path(out_dir, title_iast, author_iast="", catalog_number=""):
    """Build the markdown file path for a work from its IAST title and author.

    Title and author are transliterated from IAST to OPTITRANS, joined with
    the stripped catalog number as ``<title>_<author>_<catalog>.md``, cleaned
    by ``file_helper.clean_file_path`` and placed under ``out_dir``.
    """
    def to_optitrans(iast_text):
        return sanscript.transliterate(data=iast_text, _from=sanscript.IAST, _to=sanscript.OPTITRANS)

    name_parts = (to_optitrans(title_iast), to_optitrans(author_iast), catalog_number.strip())
    raw_name = "%s_%s_%s.md" % name_parts
    cleaned_name = file_helper.clean_file_path(file_path=raw_name)
    return os.path.join(out_dir, cleaned_name)

Exemple #5

0

Afficher le fichier

Fichier : naaraayaniiyam.py Projet : sanskrit-code/doc_curation

def get_item(id, dir_path):
    """Fetch one dashaka from sa.wikisource.org and save it as markdown.

    The page for dashaka ``id`` is loaded through the module-level
    ``browser``, stray Latin ``c`` characters in the poem text are replaced
    by the Devanagari letter they stand for, and the text is split into
    shlokas on blank lines. Each shloka is written to
    ``<dir_path>/<id:03d>.md`` preceded by an audio-embed ``<div>``; finally
    the file title is set to the Devanagari form of the zero-padded id.

    Args:
        id: Integer dashaka number (name kept for caller compatibility even
            though it shadows the builtin).
        dir_path: Directory in which the markdown file is created.
    """
    import urllib.parse
    dashaka_id = "नारायणीयम्/दशकम्_%s" % sanscript.transliterate(
        str(id), sanscript.SLP1, sanscript.DEVANAGARI)
    logging.info(dashaka_id)
    item_url = "https://sa.wikisource.org/wiki/" + urllib.parse.quote(dashaka_id)
    logging.info(item_url)
    browser.get(item_url)
    text = browser.find_element_by_css_selector("div.poem").text

    # Applied strictly in this order: the "c + vowel" pairs must be handled
    # before the bare "c" fallback, which would otherwise consume them.
    replacements = (
        ("cअ", "च"), ("cइ", "चि"), ("cई", "ची"), ("cउ", "चु"), ("cऊ", "चू"),
        ("cऋ", "चृ"), ("cॠ", "चॄ"), ("cऌ", "चॢ"), ("cॡ", "चॣ"), ("cए", "चे"),
        ("cऐ", "चै"), ("cओ", "चो"), ("cऔ", "चौ"), ("c", "च्"), ("ळ", "ल"),
    )
    for garbled, fixed in replacements:
        text = text.replace(garbled, fixed)

    shlokas = text.split("\n\n")
    outfile_path = os.path.join(dir_path, "%03d.md" % id)
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    # Explicit UTF-8: the content is Devanagari and must not depend on the
    # platform default encoding.
    with open(outfile_path, "w", encoding="utf-8") as outfile:
        for shloka_id, shloka in enumerate(shlokas, start=1):
            outfile.write(
                "<div class=\"audioEmbed\"  caption=\"सीतालक्ष्मी-वाचनम्\" src=\"https://sanskritdocuments.org/sites/completenarayaneeyam/SoundFiles/%03d/%03d_%02d.mp3\"></div>  \n"
                % (id, id, shloka_id))
            # Two trailing spaces keep the verse line breaks in markdown.
            outfile.write(shloka.replace("\n", "  \n") + "\n\n")
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.set_title(sanscript.transliterate("%03d" % id, sanscript.SLP1,
                                              sanscript.DEVANAGARI),
                      dry_run=False)

Exemple #6

0

Afficher le fichier

 def fix_lazy_anusvaara(self, data_in, omit_sam=False, omit_yrl=False, ignore_padaanta=False):
     """Fix lazy anusvaara usage in ``data_in`` by a round trip through Devanagari.

     With ``ignore_padaanta`` set, the work is delegated to
     ``fix_lazy_anusvaara_except_padaantas``. Otherwise the text is converted
     from this scheme to Devanagari, fixed by the Devanagari scheme's own
     ``fix_lazy_anusvaara`` and converted back to this scheme.
     """
     from indic_transliteration import sanscript
     options = {"omit_sam": omit_sam, "omit_yrl": omit_yrl}
     if ignore_padaanta:
         return self.fix_lazy_anusvaara_except_padaantas(data_in=data_in, **options)
     in_devanagari = sanscript.transliterate(data=data_in, _from=self.name, _to=sanscript.DEVANAGARI)
     devanagari_scheme = sanscript.SCHEMES[sanscript.DEVANAGARI]
     fixed_devanagari = devanagari_scheme.fix_lazy_anusvaara(data_in=in_devanagari, **options)
     return sanscript.transliterate(data=fixed_devanagari, _from=sanscript.DEVANAGARI, _to=self.name)

Exemple #7

0

Afficher le fichier

def get_storage_name(text,
                     source_script=None,
                     max_length=50,
                     maybe_use_dravidian_variant=True,
                     mixed_languages_in_titles=True):
    """Derive a file-system friendly OPTITRANS name from ``text``.

    The script is auto-detected when ``source_script`` is None. Each "/"
    (with any following spaces) becomes "__". Non-roman text is transliterated
    to OPTITRANS; roman text is transliterated only when its scheme is in
    ``roman.CAPITALIZABLE_SCHEME_IDS`` (optionally after marking off non-Indic
    stretches), and is otherwise left as is. The cleaned result is truncated
    to ``max_length`` characters unless that is None.
    """
    from indic_transliteration import detect
    if source_script is None:
        source_script = detect.detect(text=text)
    name = regex.sub("/ *", "__", text)
    if source_script not in roman.ALL_SCHEME_IDS:
        name = sanscript.transliterate(
            name,
            source_script,
            sanscript.OPTITRANS,
            maybe_use_dravidian_variant=maybe_use_dravidian_variant)
    elif source_script in roman.CAPITALIZABLE_SCHEME_IDS:
        if mixed_languages_in_titles:
            iast_scheme = sanscript.SCHEMES[sanscript.IAST]
            name = iast_scheme.mark_off_non_indic_in_line(name)
        # Transliteration is suspended between '<' and '>'.
        name = sanscript.transliterate(
            name,
            source_script,
            sanscript.OPTITRANS,
            suspend_on=set('<'),
            suspend_off=set('>'),
            maybe_use_dravidian_variant=maybe_use_dravidian_variant)
    name = clean_file_path(name)
    return name if max_length is None else name[:max_length]

Exemple #8

0

Afficher le fichier

def extractFirstPage(im, coords):
    """Extract the first-page fields of the document image.

    im: Image object; it is rewound to its first frame before processing.
    coords: List of size-4 tuples. Order: Box-1 containing mandal name, pin code etc.
                                          Box-2 polling station name
                                          Box-3 polling station address
                                          Box-4 net electors male
                                          Box-5 net electors female
                                          Box-6 net electors third gender
                                          Box-7 total electors

    Returns a 13-item list: ITRANS main town, ITRANS police station, pin
    code, ITRANS polling station name and address, the four remaining box
    results unchanged, then the original-script main town, police station,
    polling station name and polling station address.
    """
    def value_after_separator(line):
        # Prefer the ASCII colon; fall back to the Devanagari visarga.
        if ":" in line:
            return line.split(":")[1].strip()
        return line.split("ः")[1].strip()

    im.seek(0)
    box_results = dealWithFirstPage(im, coords)
    filled_lines = [line for line in box_results[0].split("\n") if line != '']

    main_town = value_after_separator(filled_lines[0])
    police_station = value_after_separator(filled_lines[5])
    pin_code = value_after_separator(filled_lines[8])

    def to_itrans(devanagari_text):
        return transliterate(devanagari_text, sanscript.DEVANAGARI, sanscript.ITRANS)

    main_town_eng = to_itrans(main_town)
    police_station_eng = to_itrans(police_station)
    polling_station_name = to_itrans(box_results[1])
    polling_station_address = to_itrans(box_results[2])

    transliterated = [
        main_town_eng, police_station_eng, pin_code, polling_station_name,
        polling_station_address
    ]
    elector_counts = [box_results[3], box_results[4], box_results[5], box_results[6]]
    originals = [main_town, police_station, box_results[1], box_results[2]]
    return transliterated + elector_counts + originals

Exemple #9

0

Afficher le fichier

Fichier : structures.py Projet : sanskrit-coders/dict-curation

	def get_headers(self):
		"""Return the varga headers in Devanagari, in ITRANS and in 'abdm' form."""
		title_with_devanagari_index = (
			NIGHANTU_NAAMA + ' _' + transliterate(self.index, ITRANS, DEVANAGARI))
		varga_naama_itrans = transliterate(self.varga_naama, DEVANAGARI, ITRANS)
		title_itrans = transliterate(NIGHANTU_NAAMA, DEVANAGARI, ITRANS) + ' _' + self.index
		short_header = 'abdm _' + self.index
		return [
			self.varga_naama,
			title_with_devanagari_index,
			varga_naama_itrans,
			title_itrans,
			short_header,
		]

Exemple #10

0

Afficher le fichier

def transliterate(text, toDevanagari=False):
    """Convert ``text`` between IAST and Devanagari.

    By default the text is converted from IAST to Devanagari. When
    ``toDevanagari`` is true the conversion runs from Devanagari to IAST.

    NOTE(review): the flag reads inverted relative to its name. The behaviour
    is deliberately left as is because existing callers may rely on the
    default; confirm the intent before flipping it.
    """
    if toDevanagari:
        source_scheme, target_scheme = sanscript.DEVANAGARI, sanscript.IAST
    else:
        source_scheme, target_scheme = sanscript.IAST, sanscript.DEVANAGARI
    return sanscript.transliterate(text, source_scheme, target_scheme)

Exemple #11

0

Afficher le fichier

Fichier : structures.py Projet : sanskrit-coders/dict-curation

	def get_headers(self):
		"""Return the shloka headers, or an empty list when no shloka number is set."""
		if not self.shloka_sankhya:
			return []
		number_itrans = transliterate(self.shloka_sankhya, DEVANAGARI, ITRANS)
		devanagari_header = NIGHANTU_NAAMA + ' ' + self.shloka_sankhya
		itrans_header = transliterate(NIGHANTU_NAAMA, DEVANAGARI, ITRANS) + ' ' + number_itrans
		short_header = 'abdm {}'.format(number_itrans)
		return [devanagari_header, itrans_header, short_header]

Exemple #12

0

Afficher le fichier

Fichier : roman.py Projet : zhongys/indic_transliteration

 def get_standard_form(self, data):
     """Return ``data`` in the library-standard spelling of this roman scheme.

     Roman schemes may define several representations of the same devanAgarI
     character. When the scheme has a synonym map, the text is normalised by
     converting it to Devanagari and back; without one it is returned
     unchanged.

     data : a text in the given scheme.
     """
     if self.synonym_map is None:
         return data
     from indic_transliteration import sanscript
     in_devanagari = sanscript.transliterate(
         _from=self.name, _to=sanscript.DEVANAGARI, data=data)
     return sanscript.transliterate(
         data=in_devanagari, _from=sanscript.DEVANAGARI, _to=self.name)

Exemple #13

0

Afficher le fichier

Fichier : sanscript_test.py Projet : karthikraman/indic_transliteration

 def test_devanaagarii_equivalence(self):
     """Test that synonymous ITRANS spellings give the same Devanagari text."""
     primary = S.transliterate("rAmo gUDhaM vaktI~Ngitaj~naH kShetre", S.ITRANS,
                               S.DEVANAGARI)
     # Call form of print: valid on both Python 2 and Python 3, unlike the
     # former print statement, which is a SyntaxError on Python 3.
     print(primary)
     self.assertEqual(
         primary,
         S.transliterate("raamo guuDhaM vaktii~NgitaGYaH xetre", S.ITRANS,
                         S.DEVANAGARI))

Exemple #14

0

Afficher le fichier

Fichier : manualCorrection.py Projet : drdhaval2785/ChunilalGandhiVidyabhavan

def repl(matchobject):
    """Replace anusvara by the class nasal of the following stop.

    The matched Devanagari text is converted to SLP1; every ``M`` that
    directly precedes a stop is rewritten to the nasal paired with that stop
    class in the table below; the result is converted back to Devanagari.

    Args:
        matchobject: A regex match object; only ``group(0)`` is used.

    Returns:
        The rewritten text in Devanagari.
    """
    data = transliterate(matchobject.group(0), 'devanagari', 'slp1')
    # (stops of one class, nasal that replaces a preceding M)
    nasal_for_stops = (
        ('kKgG', 'N'),
        ('cCjJ', 'Y'),
        ('wWqQ', 'R'),
        ('tTdD', 'n'),
        ('pPbB', 'm'),
    )
    for stops, nasal in nasal_for_stops:
        # Raw string: "\g<1>" in a plain literal is an invalid escape sequence.
        data = re.sub('M([%s])' % stops, nasal + r'\g<1>', data)
    return transliterate(data, 'slp1', 'devanagari')

Exemple #15

0

Afficher le fichier

def separate_rks(dry_run=False):
  """Split every sUkta markdown file into one file per Rk plus an index file.

  For each file matched by the hard-coded glob, the leading lines (up to the
  first line carrying one of the two accent marks tested below) are kept as
  introductory text; the remaining non-empty lines are cut into Rks, each Rk
  ending at the first line that contains "॥". Every Rk is dumped to its own
  markdown file under ``dest_dir_Rks``, and a sUkta file is written that
  lists one ``js_include`` div per Rk.

  Args:
    dry_run: Passed through to ``dump_to_file`` and
      ``set_filename_from_title``.
  """
  dest_dir_Rks = "/home/vvasuki/vvasuki-git/vedAH/static/atharva/shaunakam/rUDha-saMhitA/mUlam/"
  suukta_paths = glob.glob("/home/vvasuki/vvasuki-git/vedAH/content/atharva/shaunakam/rUDha-saMhitA_alt/*/*.md",
                           recursive=True)

  for suukta_path in suukta_paths:
    md_file = MdFile(file_path=suukta_path)
    [metadata, md] = md_file.read_md_file()
    lines = md.split("\n")
    # Leading lines without either accent mark are treated as introduction;
    # everything from the first accented line onwards is verse text.
    meta_lines = list(itertools.takewhile(lambda line: "॒" not in line and "॑" not in line, lines))
    lines = list(itertools.dropwhile(lambda line: "॒" not in line and "॑" not in line, lines))
    lines = [line for line in lines if line != ""]
    rk_id = 0
    # The parent directory name of the sUkta file is used as the chapter id.
    chapter_id = suukta_path.split("/")[-2]
    # The first whitespace-separated token of the title is the sUkta id;
    # the rest (with "।" removed) is the sUkta title.
    suukta_id = metadata["title"].split()[0]
    suukta_id_roman = sanscript.transliterate(suukta_id, sanscript.DEVANAGARI, sanscript.IAST)
    suukta_title = " ".join(metadata["title"].split()[1:]).replace("।", "").strip()
    # NOTE(review): dest_dir_suuktas is not assigned in this function -
    # presumably a module-level name; confirm it exists.
    dest_path_suukta = os.path.join(dest_dir_suuktas, chapter_id, suukta_id_roman + ".md")
    rk_map = {}
    while(len(lines) > 0):
      # Take the lines before the first one containing "॥", then add that
      # terminating line itself. If no remaining line contains "॥" the
      # index below is out of range and IndexError is raised.
      lines_rk = list(itertools.takewhile(lambda line: "॥" not in line, lines))
      lines_rk.append(lines[len(lines_rk)])
      if len(lines) == len(lines_rk):
        lines = []
      else:
        lines = lines[len(lines_rk):]
      rk_id = rk_id + 1
      rk_md = "\n".join(lines_rk)

      # Zero-padded Rk number, transliterated to Devanagari; used for the
      # title and as the rk_map key.
      rk_id_str = sanscript.transliterate("%02d" % rk_id, sanscript.IAST, sanscript.DEVANAGARI) 
      from doc_curation import text_data
      title_Rk = text_data.get_rk_title(rk_id=rk_id_str, rk_text=rk_md)
      # The file name is the Rk number transliterated back from Devanagari.
      dest_path_Rk = os.path.join(dest_dir_Rks, chapter_id, suukta_id_roman, sanscript.transliterate(rk_id_str, sanscript.DEVANAGARI, sanscript.IAST) + ".md")
      md_file_Rk = MdFile(file_path=dest_path_Rk)
      md_file_Rk.dump_to_file(metadata={"title": title_Rk}, content=rk_md, dry_run=dry_run)
      md_file_Rk.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
      # Read the path back from the MdFile after set_filename_from_title.
      rk_map[rk_id_str] = md_file_Rk.file_path

    # Build one include div per Rk, ordered by the Rk id string.
    suukta_md = ""
    for rk_id in sorted(rk_map.keys()):
      dest_path_Rk = rk_map[rk_id]
      suukta_md = suukta_md + """
      <div class="js_include" url="%s"  newLevelForH1="2" includeTitle="false"> </div> 
      """ % dest_path_Rk.replace("/home/vvasuki/vvasuki-git", "").replace("static/", "")

    import textwrap
    suukta_md = """
    ## परिचयः
    %s
    
    ## पाठः
    %s
    """ % ("\n    ".join(meta_lines), suukta_md)
    md_file_suukta = MdFile(file_path=dest_path_suukta)
    md_file_suukta.dump_to_file(metadata={"title": "%s %s" % (suukta_id, suukta_title)}, content=textwrap.dedent(suukta_md), dry_run=dry_run)
    md_file_suukta.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)

Exemple #16

0

Afficher le fichier

Fichier : UPExtraction.py Projet : mooncrater31/pdfExtraction

def _split_label_value(line):
    """Split an OCR'd ``label<sep>value`` line into ``(label, value)``.

    The ASCII colon is the separator when the line contains one; otherwise
    the Devanagari visarga is used. Raises IndexError when the chosen
    separator does not occur in the line.
    """
    parts = line.split(":") if ":" in line else line.split("ः")
    return parts[0], parts[1]


def tessBox(box_lst):
    """Turn the OCR output of every voter box into one row of fields.

    box_lst : Input format : A list of lists, with each inner list being of the format:
        0: Main box
        1: voterIDBox
        2: boxNumber
        3: Pagenumber

    Returns:
        ``(rows, indicesWithErrors)``. Each row is ``[name (ITRANS), age,
        gender, husband/father (ITRANS), has_husband, house_number, voter_id,
        boxNumber, page number, name (original), husband/father (original)]``.
        ``indicesWithErrors`` lists the indices of boxes that could not be
        parsed, including boxes whose detected age is below 18.
    """
    rows = []
    indicesWithErrors = []
    resLst = picEater(box_lst)
    for i, rex in enumerate(resLst):
        op = rex[0].communicate()[0].decode('utf-8')
        voter_id = dealWithID(rex[1].communicate()[0].decode('utf-8'))
        boxNumber = rex[2].communicate()[0].decode('utf-8')
        print("box number :" + str(i))
        try:
            res = [line for line in op.split("\n") if line != '']
            name_regional = _split_label_value(res[0])[1].strip()
            # The separator is now chosen from this line itself; previously
            # it was chosen by inspecting the name line (res[0]).
            relation_label, relation_value = _split_label_value(res[1])
            husband_or_father_regional = relation_value.strip()
            house_number = ''
            if ":" in res[2]:
                house_number = res[2].split(":")[1].strip()
            elif "ः" in res[2]:
                house_number = res[2].split("ः")[1].strip()
            house_number = dealWithHouses(house_number)
            has_husband = 1 if relation_label.split(" ")[0].strip() == "पति" else 0
            age = re.search('[0-9]+', res[-1]).group(0)
            if int(age) < 18:
                # An under-age value is treated as an OCR error for this box.
                raise ValueError()
            intermediate = res[-1].split(" ")
            # 3 for other states, 4 for UK.
            gender = 'F' if re.match('म', intermediate[-1]) is not None else 'M'
            opName = transliterate(name_regional, sanscript.DEVANAGARI,
                                   sanscript.ITRANS)
            opHusband_or_father = transliterate(husband_or_father_regional,
                                                sanscript.DEVANAGARI,
                                                sanscript.ITRANS)
            rows.append([
                opName, age, gender, opHusband_or_father, has_husband,
                house_number, voter_id, boxNumber, box_lst[i][3],
                name_regional, husband_or_father_regional
            ])
        except Exception:
            # Best effort: record the failing box and carry on. Unlike a bare
            # except, this lets KeyboardInterrupt and SystemExit propagate.
            indicesWithErrors.append(i)
            print("Had Errors.")
    return rows, indicesWithErrors

Exemple #17

0

Afficher le fichier

 def fix_lazy_anusvaara(self, data_in):
     """Fix lazy anusvaara usage in ``data_in`` by a round trip through ITRANS.

     The text is converted from this scheme to ITRANS, fixed by the ITRANS
     scheme's own ``fix_lazy_anusvaara`` and converted back to this scheme.
     """
     from indic_transliteration import sanscript
     itrans = sanscript.ITRANS
     as_itrans = sanscript.transliterate(data=data_in, _from=self.name, _to=itrans)
     corrected = sanscript.SCHEMES[itrans].fix_lazy_anusvaara(data_in=as_itrans)
     return sanscript.transliterate(data=corrected, _from=itrans, _to=self.name)

Exemple #18

0

Afficher le fichier

 def equal_dvng(self, w1, w2):
     """Compare two Devanagari words, treating a final anusvara and a final "m" as equal.

     When neither word ends in "ma + virama" the words are compared as they
     are. Otherwise both are converted to SLP1, a trailing ``m`` or ``M`` is
     normalised to ``m``, and the results are compared.

     Returns:
         True when the words are considered equal, False otherwise.
     """
     # "ma" + virama as a unicode-escape literal: this yields the same text
     # on Python 2 and 3, whereas str.decode() does not exist on Python 3.
     ma = u"\u092e\u094d"
     if not (w1.endswith(ma) or w2.endswith(ma)):
         return w1 == w2
     w1 = sanscript.transliterate(w1, sanscript.DEVANAGARI, sanscript.SLP1)
     w2 = sanscript.transliterate(w2, sanscript.DEVANAGARI, sanscript.SLP1)
     w1 = re.sub(r'[mM]$', 'm', w1)
     w2 = re.sub(r'[mM]$', 'm', w2)
     return w1 == w2

Exemple #19

0

Afficher le fichier

def convert(text, intran, outtran):
    """Convert a text from intran to outtran transliteration.

    The text is returned unchanged when both schemes are the same. Otherwise
    it is transliterated and every '|' in the result is replaced by '.'.
    """
    if intran == outtran:
        return text
    converted = sanscript.transliterate(text, intran, outtran)
    return converted.replace(u'|', u'.')

Exemple #20

0

Afficher le fichier

def test_from_devanagari(test_case):
    """Check conversion from Devanagari to the script named in ``test_case``.

    The case passes when either the converted text equals the expected text
    or converting it back reproduces the original Devanagari string. Cases
    whose (mapped) script is not a known scheme are skipped.
    """
    dev_string = test_case["dev_string"]
    script = test_case["script"]
    expected_text = test_case["text"]
    # Map the test-data script name onto the library name when one is defined.
    script = SCRIPT_NAME_MAP.get(script, script)
    unsupported = script in "dev" or script not in sanscript.SCHEMES
    if unsupported:
        logging.debug("Skipping over script - " + script)
        return
    result = sanscript.transliterate(dev_string, sanscript.DEVANAGARI, script)
    result_dev = sanscript.transliterate(result, script, sanscript.DEVANAGARI)
    round_trip_ok = dev_string == result_dev
    assert expected_text == result or round_trip_ok, "Failed to convert to " + script + " from devanAgarI: got " + result + " instead of " + expected_text

Exemple #21

0

Afficher le fichier

 def back_transliterate_word(self, word: str, predicted_lang_id=None):
     """Return ``word`` back-transliterated to Devanagari where appropriate.

     The word is lower-cased first. If it is not in the English dictionary
     and ``predicted_lang_id`` is not 1, its KOLKATA -> Devanagari
     transliteration is always used. Otherwise the transliteration is used
     only when it appears in ``self.hindi_words``. The result is
     NFKC-normalised and upper-cased.
     """
     lowered = word.lower()
     treat_as_english = self.en_dict.check(lowered) or predicted_lang_id == 1
     candidate = transliterate(unicodedata.normalize('NFKC', lowered),
                               sanscript.KOLKATA, sanscript.DEVANAGARI)
     if not treat_as_english:
         chosen = candidate
     elif unicodedata.normalize('NFKC', candidate) in self.hindi_words:
         chosen = candidate
     else:
         chosen = lowered
     return unicodedata.normalize('NFKC', chosen).upper()

Exemple #22

0

Afficher le fichier

Fichier : lexicon_parser.py Projet : hareeshbabu82ns/sanskrit-utils

    def handle_data(self, data):
        """Append the text of the current tag to ``self.mark_down``.

        Text inside 'l' and 'pc' tags is dropped. Text inside 'key1'/'key2'
        is transliterated from ``key_fromLang`` to ``key_toLang`` and text
        inside 's' from ``fromLang`` to ``toLang``, in each case only when
        the two settings differ. All other text is appended unchanged.
        """
        tag = self.current_tag
        if tag in ['l', 'pc']:  # this information is currently not used
            return

        if tag in ['key1', 'key2'] and self.key_fromLang != self.key_toLang:
            piece = transliterate(data, self.key_fromLang, self.key_toLang)
        elif tag == 's' and self.fromLang != self.toLang:
            # sanskrit word
            piece = transliterate(data, self.fromLang, self.toLang)
        else:
            piece = data

        self.mark_down = self.mark_down + piece

Exemple #23

0

Afficher le fichier

Fichier : structures.py Projet : sanskrit-coders/dict-curation

	def get_headers(self):
		"""Return the dhaatu headers followed by the headers of each shloka.

		Shlokas whose ``get_headers()`` returns None contribute nothing.
		"""
		title_with_devanagari_index = (
			NIGHANTU_NAAMA + ' _' + transliterate(self.index, ITRANS, DEVANAGARI))
		dhaatu_naama_itrans = transliterate(self.dhaatu_naama, DEVANAGARI, ITRANS)
		title_itrans = transliterate(NIGHANTU_NAAMA, DEVANAGARI, ITRANS) + ' _' + self.index
		headers = [
			self.dhaatu_naama,
			title_with_devanagari_index,
			dhaatu_naama_itrans,
			title_itrans,
			'abdm _' + self.index,
		]
		for shloka_headers in (shloka.get_headers() for shloka in self.shlokas):
			if shloka_headers is not None:
				headers.extend(shloka_headers)
		return headers

Exemple #24

0

Afficher le fichier

def unused_convert_sanskrit(text, inTran, outTran):
    """Return transliterated adjusted text.

    ``<srs/>`` markers are removed, the content of every ``<s>...</s>``
    element is transliterated from SLP1 to ``outTran`` and wrapped in
    ``<span class="s">``, self-closing ``<div n="..."/>`` tags are expanded
    (levels 1-5 get one ``emsp;`` prefix per level), ``<lb/>`` becomes
    ``<br />`` and a ``<br />`` is inserted before the compound/meaning and
    dash markers handled below.

    Args:
        text: Markup text to convert.
        inTran: Unused; kept for interface compatibility (the ``<s>`` content
            is always read as SLP1).
        outTran: Target transliteration scheme for ``<s>`` content.
    """
    # Remove '<srs/>'
    text = text.replace('<srs/>', '')
    # Change the s tag to span. re.split with one capture group alternates
    # plain text (even positions) and <s> content (odd positions).
    pieces = []
    for position, segment in enumerate(re.split('<s>([^<]*)</s>', text)):
        if position % 2 == 0:
            pieces.append(segment)
        else:
            pieces.append('<span class="s">' + sanscript.transliterate(
                segment, 'slp1', outTran) + '</span>')
    text1 = ''.join(pieces)
    # PE nesting of LB tag: one "emsp;" per nesting level for levels 1-5.
    # NOTE(review): "emsp;" has no leading "&" - confirm this is intended.
    for depth in range(1, 6):
        text1 = text1.replace('<div n="%d"/>' % depth,
                              'emsp;' * depth + '<div n="%d"></div>' % depth)
    # Any other self-closing div is expanded without a prefix. Raw strings:
    # "\g<1>" in a plain literal is an invalid escape sequence.
    text1 = re.sub(r'<div n="([^"]*)"/>', r'<div n="\g<1>"></div>', text1)
    text1 = text1.replace('<lb/>', '<br />')
    # AP90 compounds and meanings break
    text1 = text1.replace('<b>--', '<br /><b>--')
    text1 = text1.replace('<span class="s">--', '<br /><span class="s">--')
    # — breaks
    text1 = text1.replace('— ', '<br />— ')
    return text1

Exemple #25

0

Afficher le fichier

Fichier : enumerated.py Projet : lalitaalaalitah/doc_curation

def dump_deep_text(url_text_id,
                   url_leaf_id_padding,
                   dir_path,
                   unit_info_file,
                   get_collapsible_content=False,
                   dry_run=False):
    """Dump every leaf unit of a text from sa.wikisource.org into markdown files.

    For each subunit path listed in ``unit_info_file`` the item URL is built
    from the quoted ``url_text_id`` and the wiki path of the subunit, and the
    item is dumped to ``<dir_path>/<parents as %02d>/<leaf:03d>.md``. The
    title is the zero-padded leaf number transliterated from SLP1 to
    Devanagari.

    Args:
        url_text_id: Page-name prefix of the text on wikisource.
        url_leaf_id_padding: Passed to ``get_wiki_path`` as ``url_id_padding``.
        dir_path: Root output directory.
        unit_info_file: JSON file describing the unit structure.
        get_collapsible_content: Passed through to ``dump_item``.
        dry_run: When true, only log what would be fetched.
    """
    # "import urllib" alone does not guarantee that the urllib.parse
    # submodule is loaded; import it explicitly, once, outside the loop.
    import urllib.parse
    unit_data = text_data.get_subunit_data(unit_info_file, [])
    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        relative_dir_path = "/".join(["%02d" % x for x in subunit_path[:-1]])
        outfile_path = os.path.join(dir_path, relative_dir_path,
                                    "%03d.md" % subunit_path[-1])
        item_url = "https://sa.wikisource.org/wiki/%s/%s" % (
            urllib.parse.quote(url_text_id),
            get_wiki_path(subunit_path=subunit_path,
                          unit_data=unit_data,
                          url_id_padding=url_leaf_id_padding))
        title = sanscript.transliterate("%03d" % subunit_path[-1],
                                        sanscript.SLP1, sanscript.DEVANAGARI)
        logging.info("Getting %s to %s with title %s", item_url, outfile_path,
                     title)
        if not dry_run:
            dump_item(title=title,
                      outfile_path=outfile_path,
                      item_url=item_url,
                      get_collapsible_content=get_collapsible_content)

Exemple #26

0

Afficher le fichier

 def devanagari(self, strict_io=True):
     """Return the Devanagari transcoding of this object's SLP1 text.

     When ``strict_io`` is false the text is first passed through
     ``normalization.denormalize``.
     """
     text = self.thing if strict_io else normalization.denormalize(self.thing)
     return sanscript.transliterate(text, SLP1, DEVANAGARI)

Exemple #27

0

Afficher le fichier

def create_slp(code):
    """Create SLP file for a given dictionary code.

    code is to be selected from dictcode.json.

    The Devanagari source ``../<fullName>/orig/<bookName>.txt`` is read as
    UTF-8, transliterated to SLP1 and written to
    ``../<fullName>/slp/<bookName>.txt``; the ``slp`` directory is created
    when missing.

    Raises:
        ValueError: If the full name does not consist of exactly two parts
            separated by an underscore.
    """
    # ENSK -> ekaksharanamamala_sadhukalashagani
    fullName = utils.code_to_dict(code)
    # ekaksharanamamala, sadhukalashagani (the author part is not used here)
    bookName, _author = fullName.split('_')
    # Read the .txt file; the context manager closes it even if reading fails.
    filein = os.path.join('..', fullName, 'orig', bookName + '.txt')
    with codecs.open(filein, 'r', 'utf-8') as fin:
        data = fin.read()
    # Convert the data to SLP1.
    data = sanscript.transliterate(data, 'devanagari', 'slp1')
    # Output directory
    directory = os.path.join('..', fullName, 'slp')
    # Create if the directory does not exist.
    if not os.path.exists(directory):
        os.mkdir(directory)
    fileout = os.path.join(directory, bookName + '.txt')
    # Create output file and save the SLP data in it.
    with codecs.open(fileout, 'w', 'utf-8') as fout:
        fout.write(data)

Exemple #28

0

Afficher le fichier

Fichier : scrape_rkencp.py Projet : sanskrit-coders/doc_curation

def hyperlink(bl_file_path, all_words: list):
    """Wrap occurrences of known words in ``bl_file_path`` in ``<a href>`` links.

    Words longer than three characters that contain no double quote are
    processed longest first, in batches of 100. Each batch is applied to the
    file in place by a single ``sed -i`` call; the link target and link text
    are the transliteration of the lower-cased word. On platforms other than
    linux the function prints a message and exits with status 1.

    Args:
        bl_file_path: Path of the file edited in place.
        all_words: Candidate words to link.
    """
    import subprocess

    if sys.platform != 'linux':
        print(
            'hyperlinking currently requires sed, which may exist only on unix compilant platforms'
        )
        sys.exit(1)

    print('\nhyperlinking.... may take a minute')
    considered_words = [w for w in all_words if len(w) > 3 and '"' not in w]
    sorted_words = sorted(considered_words, key=len, reverse=True)

    batch_size = 100
    for start in range(0, len(sorted_words), batch_size):
        words = sorted_words[start:start + batch_size]
        sed_script = '; '.join(
            r'7~3s#\([,;\. -]\)\({w}\)\([,;-\. -]\)#\1<a href="{s}">{s}</a>\3#ig'
            .format(w=w,
                    s=sanscript.transliterate(
                        w.lower(), scheme_map=HWS_XLITERATE_SCHEME_MAPS[0]))
            for w in words)
        # The argv list is passed straight to sed, so no shell parses the
        # words or the file path (quotes in them can neither break nor
        # inject into the command). As before, sed's exit status is ignored.
        subprocess.run(['sed', '-i', sed_script, bl_file_path])

Exemple #29

0

Afficher le fichier

Fichier : end_to_end_test.py Projet : saibharani/indic_transliteration

def test_to_devanagari(test_case):
    """Check that the case's text converts to the expected Devanagari string."""
    logging.debug(str(test_case))
    expected_devanagari = test_case["dev_string"]
    source_script = test_case["script"]
    source_text = test_case["text"]
    actual = sanscript.transliterate(source_text, source_script, sanscript.DEVANAGARI)
    assert actual == expected_devanagari, "Failed to convert " + source_script + " to devanAgarI: got " + actual + " instead of " + expected_devanagari

Exemple #30

0

Afficher le fichier

Fichier : readwriteA4_copy.py Projet : sanskrit-lexicon/MD

def adjustlines(lines):
    """Build a seven-line report for every ``slp1:rest`` input line.

    For each line the part before the first colon is taken as SLP1 text and
    the first whitespace-separated token after it as IAST text. The report
    holds a dashed separator, the original line, both parts, the IAST token
    with marker characters removed, that token without accents, and the SLP1
    part transliterated to IAST.

    Raises IndexError for a line without a colon or with nothing after it.
    """
    report = []
    for line in lines:
        fields = line.split(":")
        slp1_part, rest = fields[0], fields[1]
        first_token = rest.split()[0]
        # Drop the marker characters and the literal "[a]".
        iast = re.sub(r"([-~*‘’])|(\[a\])", "", first_token)
        iast_unaccented = remove_accent(iast)
        slp1_as_iast = sanscript.transliterate(slp1_part, 'slp1', 'iast')
        report.extend([
            '%s' % "-" * 15,
            'orig = %s' % line,
            'slp1 = %s' % slp1_part,
            'rest = %s' % rest,
            'iast = %s' % iast,
            'iastrev = %s' % iast_unaccented,
            'slp-iast = %s' % slp1_as_iast,
        ])
    return report

Exemple #31

0

Afficher le fichier

Fichier : identify_test.py Projet : sanskrit-coders/Chandas

def test_syllables(test_case):
    """Check that the verse of ``test_case`` is identified as the expected metres."""
    logging.debug(str(test_case))
    verse_lines = test_case["verse"].split("\n")
    pattern_lines = identify.to_pattern_lines(verse_lines)
    id_result = identify.identifier.IdentifyFromPatternLines(pattern_lines)
    assert 'exact' in id_result, id_result
    exact_matches = []
    for metre in id_result['exact'].keys():
        # Metre names are lower-cased and read as IAST before comparison.
        exact_matches.append(
            sanscript.transliterate(metre.lower(), _from=sanscript.IAST, _to=sanscript.DEVANAGARI))
    assert exact_matches == test_case["exactMatches"], id_result