Beispiel #1
0
def adjust_wx(x):
    """Convert one line of 'wx'-encoded text to 'slp1', keeping markup intact.

    A line consisting solely of ``<...>`` or ``</...>`` (e.g. a
    ``<wx-headword>`` / ``</wx-headword>`` delimiter) has the text between
    the angle brackets transcoded, so the tag name itself is converted.

    Any other line is split on XML tags.  Tags and ``[Page...]`` markers are
    copied through unchanged; every other non-empty piece is transcoded from
    'wx' to 'slp1'.  The converted pieces are joined and returned.
    """
    tag_line = re.search(r'^<(/?)(.*?)>$', x)
    if tag_line:
        slash, inner = tag_line.group(1), tag_line.group(2)
        converted = transcoder.transcoder_processString(inner, 'wx', 'slp1')
        return "<%s%s>" % (slash, converted)

    def convert(piece):
        # Markup (xml tags, page markers) must not be transcoded.
        is_tag = piece.startswith('<') and piece.endswith('>')
        is_page = piece.startswith('[Page') and piece.endswith(']')
        if is_tag or is_page:
            return piece
        return transcoder.transcoder_processString(piece, 'wx', 'slp1')

    # Splitting on a capturing group yields empty strings next to tags at
    # the edges of the line; those are skipped.
    pieces = re.split(r'(<[^>]+>)', x)
    return ''.join(convert(piece) for piece in pieces if piece)
def r(text):
    """Query the INRIA lemmatizer for *text* and return its analysis string.

    The word type comes from ``wtd(text)``.  *text* is transcoded from
    'deva' to 'slp1', stripped of leading/trailing '.', and sent to the
    ``sktlemmatizer`` CGI.  The second ``<br>``-separated chunk of the
    ``latin12`` span in the result table is then interpreted:

    * for word types "Part" and "Piic" it is handed to
      ``kridantaattributes`` and that result is returned;
    * otherwise it is split at '}[' into attributes and a link holding the
      verb.  The verb (digits and underscores removed, transcoded 'roman'
      to 'slp1') is prefixed to every item of ``tosm(attributes)`` as
      ``verb.item``; several items are joined with '|'.
    """
    wordtype = wtd(text)
    query = transcoder.transcoder_processString(text, 'deva', 'slp1').strip('.')
    url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + query + '&t=SL&c=' + wordtype
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    center = soup.find("div", {"class": "center"})
    table = center.find("table", {"class": "yellow_cent"})
    span = table.tr.th.find("span", {"class": "latin12"})
    data = unicode(span).split('<br>\n')[1]

    if wordtype in ["Part", "Piic"]:
        return kridantaattributes(data)

    pieces = unicode(data).split('}[')
    attributes = pieces[0]
    verb = BeautifulSoup(pieces[1], 'html.parser').a.text
    verb = re.sub("[0-9_]+", "", verb)
    verb = transcoder.transcoder_processString(verb, 'roman', 'slp1')
    forms = tosm(attributes)
    if len(forms) > 1:
        return '|'.join([verb + '.' + form for form in forms])
    return verb + '.' + forms[0]
Beispiel #3
0
def key_transcode(m,fromcode,tocode):
    """Rebuild an ``<H1>`` line from match *m* with both keys transcoded.

    Groups 2 and 4 of *m* are the keys; each is converted from *fromcode*
    to *tocode*.  Groups 1, 3 and 5 are copied through unchanged.
    """
    prefix, key1, middle, key2, body = m.group(1, 2, 3, 4, 5)
    converted = [
        transcoder.transcoder_processString(key, fromcode, tocode)
        for key in (key1, key2)
    ]
    return "<H1>%s{%s}%s{%s}%s" % (prefix, converted[0], middle, converted[1], body)
Beispiel #4
0
def key_transcode(m,fromcode,tocode):
    """Return the ``<H1>`` line described by match *m* with transcoded keys.

    The match is expected to expose five groups: text, key1, text, key2 and
    body.  Only the two keys are passed through the transcoder
    (*fromcode* -> *tocode*); the other groups are emitted verbatim.
    """
    def convert(key):
        return transcoder.transcoder_processString(key, fromcode, tocode)

    head = m.group(1)
    first_key = m.group(2)
    between = m.group(3)
    second_key = m.group(4)
    tail = m.group(5)
    return "<H1>%s{%s}%s{%s}%s" % (
        head, convert(first_key), between, convert(second_key), tail)
Beispiel #5
0
 def transcode(self,tranin,tranout):
  """Return a one-line description of this record in the *tranout* coding.

  ``fullroot`` and ``sense`` are transcoded from *tranin* to *tranout*;
  when *tranout* is 'deva', double quotes are first removed from the
  sense.  ``L`` and ``code`` are reported unchanged.
  """
  fullroot = transcoder_processString(self.fullroot,tranin,tranout)
  raw_sense = self.sense
  if tranout == 'deva':
   raw_sense = raw_sense.replace('"','')
  fields = (
   fullroot,
   transcoder_processString(raw_sense,tranin,tranout),
   self.L,
   self.code,
  )
  return 'fullroot=%s, sense=%s, L=%s, mwcode=%s' % fields
Beispiel #6
0
def alterations(filein,fileout):
    """Convert the 'vel'-coded text in *filein* to Devanagari in *fileout*.

    Pipeline: read *filein* as UTF-8 and strip it; apply ``changelist``;
    apply ``snchanges`` to every space-separated word; transcode
    'vel' -> 'slp1'; apply ``slpchanges``; transcode 'slp1' -> 'deva';
    remove every '#'; write the result to *fileout* as UTF-8.

    A ``#<filein>#`` header line is appended to ``log.txt`` in the current
    directory.  Progress messages are printed to stdout.

    All files are handled with ``with`` so they are closed even when one of
    the processing steps raises.
    """
    with codecs.open(filein,'r','utf-8') as fin:
        data = fin.read().strip()
    print('making preprocess changes')
    data = changelist(data)
    print("Debugging and writing to log.txt")
    # Per-word diagnostics for padamanjari issues #1 and #2 used to be
    # written here; that code was disabled, so only the header line reaches
    # the log.
    with codecs.open('log.txt','a','utf-8') as log:
        log.write('#'+filein+"#\n")
    data = ' '.join([snchanges(word) for word in data.split(' ')])
    print('changing to slp1')
    output = transcoder.transcoder_processString(data,'vel','slp1')
    output = slpchanges(output)
    print('changing to Devanagari')
    output = transcoder.transcoder_processString(output,'slp1','deva')
    output = output.replace('#','')
    print('putting the data in output folder')
    with codecs.open(fileout,'w','utf-8') as fout1:
        fout1.write(output)
Beispiel #7
0
def alterations(filein, fileout):
    """Convert the 'vel'-coded text in *filein* to Devanagari in *fileout*.

    Steps, in order:

    1. read *filein* (UTF-8) and strip surrounding whitespace;
    2. apply ``changelist`` to the whole text;
    3. append a ``#<filein>#`` header line to ``log.txt``;
    4. apply ``snchanges`` to each space-separated word;
    5. transcode 'vel' -> 'slp1', then apply ``slpchanges``;
    6. transcode 'slp1' -> 'deva' and drop every '#';
    7. write the result to *fileout* (UTF-8).

    Files are opened with ``with`` so that an exception in any step cannot
    leave a handle open.  Progress messages go to stdout.
    """
    with codecs.open(filein, 'r', 'utf-8') as fin:
        data = fin.read().strip()

    print('making preprocess changes')
    data = changelist(data)

    print("Debugging and writing to log.txt")
    # Only the header is logged: the per-word logging for padamanjari
    # issues #1 and #2 had been disabled in this function.
    with codecs.open('log.txt', 'a', 'utf-8') as log:
        log.write('#' + filein + "#\n")

    words = [snchanges(word) for word in data.split(' ')]
    data = ' '.join(words)

    print('changing to slp1')
    output = transcoder.transcoder_processString(data, 'vel', 'slp1')
    output = slpchanges(output)

    print('changing to Devanagari')
    output = transcoder.transcoder_processString(output, 'slp1', 'deva')
    output = output.replace('#', '')

    print('putting the data in output folder')
    with codecs.open(fileout, 'w', 'utf-8') as fout1:
        fout1.write(output)
Beispiel #8
0
def unused_convertrecs(recs,tranin,tranout):
    """Add transcoded ``abbrvunicode`` / ``titleunicode`` to each record.

    Modifies *recs* in place: ``rec.abbrv`` and ``rec.title`` are transcoded
    from *tranin* to *tranout*.  If the converted text still contains a
    letter immediately followed by a digit 1-9, a warning is printed.

    A record that fails is reported together with its 1-based position and
    skipped.  Only ``Exception`` subclasses are caught, so Ctrl-C and
    ``SystemExit`` are no longer swallowed as the former bare ``except:``
    did.
    """
    for n, rec in enumerate(recs, 1):
        try:
            rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
            rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
            m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode)
            if m:
                # Two spaces after the colon reproduce the former
                # ``print "TRANSCODER WARNING: ", value`` output.
                print("TRANSCODER WARNING:  %s" % m.group(0).encode('utf-8'))
        except Exception:
            print("convertrecs problem %s %s" % (n, rec.line.encode('utf-8')))
Beispiel #9
0
def linking(fin,fout):
    """Rewrite correction records with headword and note in Devanagari.

    Each input line (after ``triming``) has five ':'-separated fields:
    ``dict:headword,lnum:replica:errcode:note``, for example
    ``acc:akzoByatantre,41695:akzoByatantre:n:oBy``.  The headword and the
    note are transcoded 'slp1' -> 'deva'; the output line repeats the
    converted headword in the third field:
    ``dict:<hw>,lnum:<hw>:errcode:<note>``.

    Both files are opened with ``with``; previously the input file was
    never closed.
    """
    with codecs.open(fin,'r','utf-8') as infile:
        lines = triming(infile.readlines())
    with codecs.open(fout,'w','utf-8') as outfile:
        for line in lines:
            # The third field (replica) is read but not used in the output.
            dictcode, headword, _replica, errcode, note = line.split(':')
            hw, lnum = headword.split(',')
            hw = transcoder.transcoder_processString(hw,'slp1','deva')
            note = transcoder.transcoder_processString(note,'slp1','deva')
            outfile.write(dictcode+':'+hw+','+lnum+':'+hw+':'+errcode+':'+note+'\n')
    print("Check %s for testing" % fout)
Beispiel #10
0
 def transcode(self,tranin,tranout):
  """Describe this record as text, transcoded from *tranin* to *tranout*.

  ``fullroot`` and ``sense`` are transcoded (double quotes are removed
  from the sense first when *tranout* is 'deva'); ``sid`` and ``code``
  are included as they are.
  """
  converted_root = transcoder_processString(self.fullroot,tranin,tranout)
  plain_sense = self.sense.replace('"','') if tranout == 'deva' else self.sense
  converted_sense = transcoder_processString(plain_sense,tranin,tranout)
  return 'fullroot=%s, sense=%s, sid=%s, mwcode=%s' % (
    converted_root, converted_sense, self.sid, self.code)
Beispiel #11
0
def convertrecs(recs,tranin,tranout):
    """Add transcoded ``abbrvunicode`` / ``titleunicode`` to each record.

    Modifies *recs* in place: ``rec.abbrv`` and ``rec.title`` are transcoded
    from *tranin* to *tranout*.  If the converted text still contains a
    letter immediately followed by a digit 1-9, a warning is printed.
    Afterwards 'YOLLY' in the title is turned back into 'JOLLY' (an author
    name that must not be transcoded).

    A record that fails is reported together with its 1-based position and
    skipped.  Only ``Exception`` subclasses are caught, so Ctrl-C and
    ``SystemExit`` are no longer swallowed as the former bare ``except:``
    did.
    """
    for n, rec in enumerate(recs, 1):
        try:
            rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
            rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
            m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode)
            if m:
                # Two spaces after the colon reproduce the former
                # ``print "TRANSCODER WARNING: ", value`` output.
                print("TRANSCODER WARNING:  %s" % m.group(0).encode('utf-8'))
            # Undo some transcodings
            rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode)  # JOLLY is an author
        except Exception:
            print("convertrecs problem %s %s" % (n, rec.line.encode('utf-8')))
Beispiel #12
0
def unused_adjust_hk(m):
    """Convert the 'hk' text in group 1 of match *m* to 'slp1' inside ``<s>``.

    XML tags (``<...>``) and XML entities (``&...;``) are copied through
    unchanged.  In the remaining text '|' (a non-standard danda) becomes
    '.', the exact piece 'oMM' becomes 'o~' (OM), and everything else is
    transcoded 'hk' -> 'slp1'.  The result is wrapped as ``<s>...</s>``.
    """
    def hk_to_slp1(text):
        text = text.replace('|', '.')
        if text == 'oMM':
            return 'o~'
        return transcoder.transcoder_processString(text, 'hk', 'slp1')

    converted = []
    for chunk in re.split(r'(<[^>]+>)', m.group(1)):
        if not chunk:
            continue
        if chunk.startswith('<'):
            converted.append(chunk)  # xml tag
            continue
        for piece in re.split(r'(&.*;)', chunk):
            if not piece:
                continue
            if piece.startswith('&'):
                converted.append(piece)  # xml entity
            else:
                converted.append(hk_to_slp1(piece))
    return "<s>%s</s>" % ''.join(converted)
Beispiel #13
0
def abbrv_transcode(p):
    """Transcode *p* from 'as' to 'roman1', then repair a known error.

    The transcoding turns 'Journ' into 'Yourn'; that is reverted afterwards.
    """
    converted = transcoder.transcoder_processString(p, 'as', 'roman1')
    return converted.replace('Yourn', 'Journ')
def generator(analysedword, translit="slp1"):
    """Return the word forms matching the analyses in *analysedword*.

    *analysedword* holds one or more analyses separated by '|'.  Each
    analysis is ``root-tag-tag-...``.  A trailing pair such as
    ``verbgana-3`` (also ``aoristgana`` / ``injunctivegana`` with a number
    from 1 to 10) is an artificially added attribute and is removed.  For
    every entry returned by ``findrootword(root)`` whose items form a proper
    superset of the tags, the entry's last item (the word form) is
    collected.

    The collected forms of all analyses are joined with '|'.  With
    ``translit="deva"`` the joined string is transcoded 'slp1' -> 'deva'.

    Fixes over the earlier version:

    * the input is split on the literal '|' character; ``re.split('|', ...)``
      treats '|' as a regex alternation of two empty patterns and never
      split on the bar;
    * every analysis is processed; previously the function returned from
      inside the loop after the first one;
    * an analysis with fewer than two tags no longer raises IndexError.
    """
    gana_numbers = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10')
    gana_markers = ('verbgana', 'aoristgana', 'injunctivegana')

    analysedword = unicode(analysedword)  # unicode
    outlist = []
    for datum in analysedword.split('|'):
        separate = datum.split('-')
        rootword = separate[0]  # base word
        taglist = separate[1:]  # attributes
        if (len(taglist) >= 2 and taglist[-1] in gana_numbers
                and taglist[-2] in gana_markers):
            taglist = taglist[:-2]  # remove artificially added attributes
        wanted = set(taglist)
        for rootdatum in findrootword(rootword):
            # Proper-subset test: the entry must carry all requested tags
            # plus at least one more item (the word form itself).
            if wanted < set(rootdatum):
                outlist.append(rootdatum[-1])

    joined = "|".join(outlist)
    if translit == "deva":
        return transcoder.transcoder_processString(joined, 'slp1', 'deva')
    return joined
Beispiel #15
0
def disp_md(dictcode,icase,L,hw0,url,page0,datalines):
    """Return the lines of one case, formatted for GitHub Markdown.

    Layout: a title line (case number, headword, page link), then a fenced
    block holding a candidate headword-change record
    ``dictcode:hw0,L:hw0:n:``, an empty line and up to 10 data lines
    (after ``adjust_datalines``, with '|' removed and transcoded
    'as' -> 'roman'; lines that are blank after conversion are dropped).
    When more than 10 data lines exist, a note with the number of omitted
    lines is added.  A dashed rule and an empty line close the case.
    """
    max_shown = 10
    pageref = "[page %s](%s)" % (page0, url)
    outarr = [' Case %04d: %s  %s ' % (icase, hw0, pageref)]
    datalines = adjust_datalines(dictcode, datalines)
    outarr.append('```')
    # potential headword change record
    outarr.append("%s:%s,%s:%s:n:" % (dictcode, hw0, L, hw0))
    outarr.append('')
    for line in datalines[0:max_shown]:
        # '|' is a line-separator in CAE; drop it before transcoding
        converted = transcoder.transcoder_processString(line.replace('|', ''), 'as', 'roman')
        if converted.strip():
            outarr.append('%s' % converted)
    hidden = len(datalines) - max_shown
    if hidden > 0:
        outarr.append('  [and %s more lines]' % hidden)
    outarr += ['```', '------------------------------------------', '']
    return outarr
Beispiel #16
0
def transcode(x,tranout='slp1'):
    """Transcode *x* from 'slp1' to *tranout*.

    Lines starting with ';' are returned unchanged.
    """
    is_comment = x.startswith(';')
    return x if is_comment else transcoder.transcoder_processString(x,'slp1',tranout)
Beispiel #17
0
def jnutrimline(a, b):
    """Return ``<verb>:<gana>:<a>`` built from the '#'-separated line *b*.

    The text before the first '#' is transcoded 'deva' -> 'slp1'.  The gana
    name is the third comma-separated item of the second '#'-field and is
    converted with ``gananametonumber``.
    """
    fields = b.split('#')
    gana_name = fields[1].split(',')[2]
    gana = gananametonumber(gana_name)
    verb = transcoder.transcoder_processString(fields[0], 'deva', 'slp1')
    return verb + ':' + gana + ':' + a
Beispiel #18
0
def getbasengrams(forThisBook, nth):
    """Collect the *nth*-grams of every book except *forThisBook*.

    For each other book and each pada directory ``../../<book>/pada-A.P``
    (A = 1..8, P = 1..4) every file matching ``*.*`` is read as UTF-8.  The
    text after the second '---' separator is stripped, transcoded
    'deva' -> 'slp1', reduced to letters, spaces and apostrophes, UTF-8
    encoded and passed to ``getngrams``.  The union of all results is
    returned as a set.  Progress is printed to stdout.

    Files are opened with ``with`` so a failure while processing one file
    cannot leak its handle, and the result set is updated in place instead
    of being copied for every file.
    """
    booklist = [
        'balamanorama', 'kashika', 'laghu', 'nyasa', 'samhita', 'tattvabodhini'
    ]
    # 'pada-1.1' ... 'pada-8.4'
    padalist = [
        'pada-%d.%d' % (adhyaya, pada)
        for adhyaya in range(1, 9) for pada in range(1, 5)
    ]
    result = set()
    for book in booklist:
        print(book)
        if book == forThisBook:
            continue
        for pada in padalist:
            inputdir = '../../' + book + '/' + pada
            inputfiles = glob.glob(inputdir + '/*.*')
            print(inputdir)
            for inputfile in inputfiles:
                with codecs.open(inputfile, 'r', 'utf-8') as fin:
                    data = fin.read()
                text = data.split('---')[2].strip()
                text = transcoder.transcoder_processString(
                    text, 'deva', 'slp1')
                text = re.sub('[^a-zA-Z \']+', '', text)
                result.update(getngrams(text.encode('utf-8'), nth))
            print('%s %s gram' % (len(result), nth))
    return result
def adv(text):
    """Return the Devanagari form of an adverb confirmed by the lemmatizer.

    *text* is ``<word>.<type>``.  Unless the type is 'adv' nothing is looked
    up and None is returned.  Otherwise the INRIA lemmatizer is queried for
    the word as an adverb ('Advb'); if the response does not contain
    'not found as a', the word is returned transcoded 'slp1' -> 'deva'.
    In every other case the result is None.
    """
    parts = text.split('.')
    if parts[1] != 'adv':
        return None
    url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + parts[0] + '&t=SL&c=Advb'
    page = urllib2.urlopen(url).read()
    if 'not found as a' in page:
        return None
    return transcoder.transcoder_processString(parts[0],'slp1','deva')
def simpleslp1(word):
    """Return the list of simplified spellings under which *word* is searched.

    *word* is passed through ``simple_lower`` and ``remove_double`` and then
    through the 'slp1' -> 'simpleslp1lo' transcoder; that spelling is the
    first item of the result.  Variants are added for:

    * the vowel 'f', spelled 'ri', 'ru' or 'ar';
    * 'm' versus 'n' before a consonant (via ``mn_consonants``);
    * 'kxp', spelled 'klrp', 'klrip', 'klrup' or 'kalp';
    * a final 'ar', spelled 'ri', 'ru' or 'r' (aimed at verbs such as
      kar <-> kf, but it also catches words like punar);
    * the verb 'kalp', spelled 'klp', 'klrp' or 'klrip'.
    """
    def to_simple(text):
        return transcoder.transcoder_processString(text, 'slp1', 'simpleslp1lo')

    def add_if_new(items, candidate):
        if candidate not in items:
            items.append(candidate)

    word2 = remove_double(simple_lower(word))
    ans1 = to_simple(word2)
    ans = [ans1]
    if 'f' in word2:
        # other spellings of 'f'
        for alt in ('ri', 'ru', 'ar'):
            ans.append(to_simple(word2.replace('f', alt)))
    # allow either 'm' or 'n' before a consonant
    m_to_n = mn_consonants(ans, 'm', 'n')
    n_to_m = mn_consonants(ans, 'n', 'm')
    ans = ans + m_to_n + n_to_m
    if 'kxp' in word2:
        # other spellings of 'x': l and also lr, lri
        for alt in ('klrp', 'klrip', 'klrup', 'kalp'):
            ans.append(to_simple(word2.replace('kxp', alt)))
    if re.search(r'ar$', ans1):
        for alt in ('ri', 'ru', 'r'):
            add_if_new(ans, re.sub(r'ar$', alt, ans1))
    # special case of the 'kalp' verb (in pw, etc) == kxp
    if ans1 == 'kalp':
        for alt in ('klp', 'klrp', 'klrip'):
            add_if_new(ans, alt)
    return ans
    """
def iter(wordxml, strength="Full"):
    """Turn the XML analyses in *wordxml* into ``stem-tag-tag...`` strings.

    *wordxml* holds one XML entry per analysis, separated by '|'; the marker
    "????" is returned unchanged.  In each entry the last child of the root
    element carries the stem in its ``stem`` attribute; every other child
    describes one set of word attributes.  For each such child the output
    contains ``stem-<child tag>-<descendant tag>-...``.

    With ``strength="deva"`` the stem is transcoded 'slp1' -> 'deva' and the
    tag lists are passed through ``converttodevanagari``.  All results are
    joined with '|'.
    """
    if wordxml == "????":
        return "????"  # error marker is passed through

    want_deva = strength == "deva"
    wordwithtags = []
    for entry in unicode(wordxml).split("|"):
        root = etree.parse(StringIO(entry)).getroot()
        elements = root.getchildren()
        stem = elements[-1].get("stem").strip()  # base word in SLP1
        if want_deva:
            baseword = transcoder.transcoder_processString(stem, "slp1", "deva")
        else:
            baseword = stem
        # One list per attribute element: its own tag followed by the tags
        # of all its descendants.
        # NOTE: reading the gaNa (``gn`` attribute of prs / aor / inj, added
        # as 'verbgana' / 'aoristgana' / 'injunctivegana' plus the number)
        # is currently disabled.
        attributes = [
            [child.tag] + [tagitem.tag for tagitem in child.xpath(".//*")]
            for child in elements[:-1]
        ]
        outputlist = converttodevanagari(attributes) if want_deva else attributes
        for member in outputlist:
            wordwithtags.append(baseword + "-" + "-".join(member))
    return "|".join(wordwithtags)
def convertfromfile(inputfile, outputfile):
    """Analyse every word of a Devanagari file and write the annotated text.

    Each line of *inputfile* (UTF-8) is stripped, transcoded
    'deva' -> 'slp1' and split at runs of non-word characters.  Words
    (even positions of the split) are written as ``<word>(<analysis>)``,
    where the word is transcoded back to Devanagari and the analysis comes
    from ``devanagaridisplay``; the separators (odd positions) are written
    transcoded to Devanagari.  Everything written to *outputfile* is also
    printed to stdout, one token per line.

    Changes over the earlier version: both files are closed even on error,
    the regex is a raw string, an always-true ``i != len(dat)`` test is
    gone, and each token is transcoded once instead of twice.
    """
    with codecs.open(inputfile, "r", "utf-8") as f:
        lines = f.readlines()
    with codecs.open(outputfile, "w", "utf-8") as g:
        for line in lines:
            line = transcoder.transcoder_processString(line.strip(), "deva", "slp1")
            # The capturing group keeps the separators: words sit at even
            # indices, separators at odd indices.
            tokens = re.split(r"(\W+)", line)
            for i, token in enumerate(tokens):
                if i % 2 == 0:
                    word = token.strip()
                    analysis = devanagaridisplay(word)
                    rendered = transcoder.transcoder_processString(
                        word, "slp1", "deva") + "(" + analysis + ")"
                else:
                    rendered = transcoder.transcoder_processString(token, "slp1", "deva")
                g.write(rendered)
                print(rendered)
            g.write("\n")
            print('')  # blank line on the terminal between input lines
def convertline(line,tranfrom,tranto):
    """Transcode the field at index 4 of an '@'-separated *line*.

    The field (the fifth one, counting from one) is lower-cased first when
    *tranfrom* is 'roman2', then transcoded from *tranfrom* to *tranto*.
    All other fields are returned unchanged.
    """
    fields = line.split('@')
    target = fields[4]
    if tranfrom == 'roman2':
        target = target.lower()
    fields[4] = transcoder.transcoder_processString(target, tranfrom, tranto)
    return '@'.join(fields)
def iter(wordxml, strength="Full"):
    """Turn the XML analyses in *wordxml* into ``stem-tag-tag...`` strings.

    *wordxml* holds one XML entry per analysis, separated by '|'.  In each
    entry the last child of the root element stores the stem (attribute
    ``stem``); all other children are the possible word attributes.  Each
    of them yields ``stem-<child tag>-<descendant tag>-...``.

    With ``strength="deva"`` the stem is transcoded 'slp1' -> 'deva' and the
    tag lists go through ``converttodevanagari``; otherwise everything
    stays in SLP1.  The strings of all analyses are joined with '|'.
    """
    def tags_of(element):
        # the element's own tag followed by the tags of all its descendants
        return [element.tag] + [tagitem.tag for tagitem in element.xpath('.//*')]

    in_deva = (strength == "deva")
    wordwithtags = []
    for individualentry in unicode(wordxml).split('|'):
        root = etree.parse(StringIO(individualentry)).getroot()
        nodes = root.getchildren()
        attribute_nodes, stem_node = nodes[:-1], nodes[-1]
        baseword = stem_node.get('stem').strip()  # SLP1
        if in_deva:
            baseword = transcoder.transcoder_processString(baseword, 'slp1', 'deva')
        # NOTE: extraction of the gaNa number (``gn`` attribute of prs / aor /
        # inj elements) is currently switched off.
        attributes = [tags_of(node) for node in attribute_nodes]
        if in_deva:
            attributes = converttodevanagari(attributes)
        wordwithtags.extend(
            baseword + "-" + "-".join(member) for member in attributes)
    return "|".join(wordwithtags)
Beispiel #25
0
def convert4(datain, fileout, tranin, tranout):
    body = datain
    body1 = transcoder.transcoder_processString(body, tranin, tranout)
    with codecs.open(fileout, "w", 'utf-8') as f:
        f.write('%s\n' % body1)
    #y = "%s %s" % (head,body1)
    #fpout.write("%s\n" % y)
    #fp.close()
    #fpout.close()
    print "fileout=", fileout
Beispiel #26
0
def dev(file):
    """Convert an SLP1 file to Devanagari and write it to ``skd_deva.txt``.

    *file* is read as 'utf-8-sig', transcoded 'slp1' -> 'deva', and every
    occurrence of the sequence u'ळ्ह्' is replaced by '|' before the text is
    written to ``skd_deva.txt`` ('utf-8-sig') in the current directory.

    The earlier version also opened ``hindidevanagariverbform.txt`` for
    writing and immediately dropped the handle, which truncated that file
    and leaked the descriptor; this leftover has been removed.  Files are
    now closed even when an error occurs.
    """
    with codecs.open(file, 'r+', 'utf-8-sig') as f:
        data = f.read()
    data = transcoder.transcoder_processString(data,'slp1','deva')
    data = re.sub(u'ळ्ह्', '|', data)
    with codecs.open("skd_deva.txt", "w+", "utf-8-sig") as g:
        g.write(data)
Beispiel #27
0
def convertline(line,tranfrom,tranto):
    """Transcode *line* up to its first ``[Page`` marker.

    Only the text before the first '[Page' is transcoded from *tranfrom* to
    *tranto*; the marker and everything after it are kept as they are.

    Returns ``(unconverted, newline)``; ``unconverted`` is True when the
    converted part still contains a letter directly followed by a digit.
    """
    head, marker, rest = line.partition('[Page')
    converted = transcoder.transcoder_processString(head, tranfrom, tranto)
    unconverted = re.search(r'[a-zA-Z][0-9]', converted) is not None
    return (unconverted, converted + marker + rest)
def as2slp1(x):
    """Clean the 'as'-coded headword *x* and transcode it to 'slp1'.

    Before transcoding, the following are removed in this order: spaces and
    the characters ``+ . ; -``; trailing commas; '(?)'; parenthesised text
    starting with '='; a parenthesised group at the end; everything from
    '=' on (a variant); everything from the first ',' on.  The text is then
    lower-cased (BURFEY represents IAST of verbs in capital letters).
    """
    removals = (
        r'[ +.;-]',
        r',+$',
        r'\(\?\)',
        r'\(=.*?\)',
        r'\(.*?\)$',
        r'=.*$',  # represent variant
        r',.*$',
    )
    cleaned = x
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)
    return transcoder.transcoder_processString(cleaned.lower(), 'as', 'slp1')
Beispiel #29
0
 def __init__(self,line):
  """Parse an ``<e>...<in>X</in> <out>Y</out>`` line.

  ``status`` is False when the line does not have that form; no other
  attribute is set in that case.  Otherwise ``status`` is True, ``slp1``
  is X, ``romanraw`` is Y and ``roman`` is X transcoded
  'slp1' -> 'roman'.
  """
  parsed = re.search(r'^<e>.*<in>(.*?)</in> <out>(.*?)</out>', line.rstrip('\r\n'))
  self.status = parsed is not None
  if not self.status:
   return
  self.slp1, self.romanraw = parsed.group(1), parsed.group(2)
  self.roman = transcoder.transcoder_processString(self.slp1,"slp1","roman")
Beispiel #30
0
def output(f, tranin, tranout, body):
    """Write a diagnostic report on transcoding *body* to the open file *f*.

    Five lines are written: the input labelled with *tranin*, the converted
    text labelled with *tranout*, the ``repr`` of every converted character,
    the Unicode name of every converted character, and an empty line.
    """
    converted = transcoder.transcoder_processString(body, tranin, tranout)
    f.write('%4s: %s\n' % (tranin, body))
    f.write('%s %s\n' % (tranout, converted))
    f.write('unic: %s\n' % ' '.join([repr(char) for char in converted]))
    f.write('    : %s\n' % ','.join([unicodedata.name(char) for char in converted]))
    f.write('\n')
def convertline(line,tranfrom,tranto):
    """Transcode the field at index 4 of an '@'-separated *line*.

    Returns ``(unconverted, newline)``.  ``unconverted`` is True when the
    converted field still contains a letter directly followed by a digit;
    ``newline`` is the line with that field replaced by its transcoding
    from *tranfrom* to *tranto*.
    """
    fields = line.split('@')
    converted = transcoder.transcoder_processString(fields[4], tranfrom, tranto)
    fields[4] = converted
    unconverted = re.search(r'[a-zA-Z][0-9]', converted) is not None
    return (unconverted, '@'.join(fields))
Beispiel #32
0
def dev1(file):
    """Convert an SLP1 file to Devanagari line by line into ``skd_deva.txt``.

    Every line of *file* ('utf-8-sig') is transcoded 'slp1' -> 'deva', the
    sequence u'ळ्ह्' is replaced by '|', and the line is written to
    ``skd_deva.txt`` ('utf-8-sig') and printed.

    Both files are managed by ``with`` so they are closed even when the
    conversion of a line fails.
    """
    with codecs.open(file, 'r+', 'utf-8-sig') as f, \
            codecs.open("skd_deva.txt", "w+", "utf-8-sig") as g:
        for datum in f.readlines():
            datum = transcoder.transcoder_processString(datum,'slp1','deva')
            datum = re.sub(u'ळ्ह्', '|', datum)
            g.write(datum)
            print(datum)
Beispiel #33
0
 def toString(self):
  """Return this link as one tab-separated line.

  Fields, in order: ``authrec.cologneid``; ``linkkey``; ``linkkey``
  transcoded 'asls' -> 'iast' -> 'iast1'; ``linkkey`` transcoded
  'as1' -> 'roman'; ``authrec.authabbrev()``; ``authrec.toString()``;
  ``print_type()``.

  If ``authrec.cologneid`` cannot be read, the offending line is printed
  and the program exits with status 1.  Only ``Exception`` subclasses
  trigger this path; the former bare ``except:`` also intercepted
  KeyboardInterrupt and SystemExit.
  """
  try:
   cologneid = self.authrec.cologneid
  except Exception:
   print("Link.toString error: %s" % self.line.encode('utf-8'))
   exit(1)
  authkey1 = self.authrec.authabbrev()
  linkkey1 = transcoder.transcoder_processString(self.linkkey,'as1','roman')
  # transcode the same way it is done for ls in
  # correctionwork/cologne-issue-216
  linkkey2a = transcoder.transcoder_processString(self.linkkey,'asls','iast')
  linkkey2 = transcoder.transcoder_processString(linkkey2a,'iast','iast1')
  outarr = [
   cologneid,
   self.linkkey,
   linkkey2,
   linkkey1,
   authkey1,
   self.authrec.toString(),
   self.print_type(),
  ]
  return '\t'.join(outarr)
def wtd(text):
    """Return the first lemmatizer word type under which *text* is found.

    *text* is transcoded 'deva' -> 'slp1' and looked up in the INRIA
    lemmatizer once per candidate type, in the order listed below.  The
    first type whose response lacks 'not found as a' is returned; if every
    lookup fails the result is None.
    """
    query = transcoder.transcoder_processString(text,'deva','slp1')
    base = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + query + '&t=SL&c='
    candidates = ('Noun', 'Voca', 'Verb', 'Pron', 'Part', 'Advb', 'Abso',
                  'Iic', 'Ifc', 'Iiv', 'Piic')
    for candidate in candidates:
        page = urllib2.urlopen(base + candidate).read()
        if 'not found as a' not in page:
            return candidate
    return None
Beispiel #35
0
def simpleslp1(word):
    """Return simplified search spellings of the SLP1 *word*.

    The capital letters listed in the first regex are lower-cased (Y and R
    are left alone here), a doubled letter is reduced to a single one, and
    the result is passed through the 'slp1' -> 'simpleslp1lo' transcoder.
    That spelling is the first item of the returned list.  Up to two
    variants follow:

    * 'ar' at the end or before a non-vowel reduced to 'r' (such an 'ar'
      may stand for the vowel 'f');
    * additionally 'ri' reduced to 'r' when it is neither preceded nor
      followed by a vowel.  When this variant is produced a 'dbg:' line
      is printed.
    """
    word1 = re.sub('([AIUFXEOMHKGNCJWQTDPBLVSZ])',
                   lambda m: m.group(1).lower(), word)
    word2 = re.sub(r'(.)\1', r'\1', word1)  # xx -> x
    var = transcoder.transcoder_processString(word2, 'slp1', 'simpleslp1lo')
    ans = [var]

    # 'ar' not followed by a vowel -> 'r'
    word3 = re.sub(r'(ar)([^aiufeo]|$)', r'r\2', var)
    if word3 != var:
        ans.append(word3)

    # 'ri' between non-vowels (or at an edge) -> 'r', i.e. drop the 'i'
    word4 = re.sub(r'(^|[^aiufeo])ri([^aiufeo]|$)', r'\1r\2', word3)
    if word4 != word3:
        ans.append(word4)
        print('dbg:', word, word1, word2, var, word3, word4)
    return ans
Beispiel #36
0
def write(option,fileout,mergerecs,tranout,name1,name2):
    """Write merged record pairs to *fileout*, transcoded 'slp1' -> *tranout*.

    *mergerecs* is a sequence of ``(rec1, rec2)`` pairs; either member may
    be None.  A pair is a "problem" when a member is None or has ``k``
    equal to '?'.  With ``option == 1`` problem pairs are skipped, with
    ``option == 2`` the non-problem pairs are skipped.

    For every pair written the file receives::

        ; Case NNNN: mw = <k transcoded>
        <name1>: ...      (one line per item of rec1.x, or '?')
        ;
        <name2>: ...      (one line per item of rec2.x, or '?')
        ;

    The number of pairs written is printed at the end.

    Fixes over the earlier version: the lines of rec2 go into the second
    group (they were appended to the first one, leaving the second group
    empty), and ``k`` falls back to '?' when both records are None instead
    of being undefined or left over from the previous pair.
    """
    tranin = 'slp1'
    n = 0
    with codecs.open(fileout,"w","utf-8") as f:
        for rec1, rec2 in mergerecs:
            flagok = not (rec1 is None or rec2 is None
                          or rec1.k == '?' or rec2.k == '?')
            if (option == 1) and (not flagok):
                # skip this problem merged record
                continue
            if (option == 2) and flagok:
                # skip this non-problem merged record
                continue
            n = n + 1
            k = '?'
            outarr1 = []
            outarr2 = []
            for rec, name, lines in ((rec1, name1, outarr1), (rec2, name2, outarr2)):
                if rec is None:
                    lines.append('%s: %s' % (name, '?'))
                    continue
                k = rec.k
                for r in rec.x:
                    lines.append('%s: %s' % (name, r.transcode(tranin,tranout)))
                    assert k == r.mw
            kstr = transcoder_processString(k,tranin,tranout)
            outarr = ['; Case %04d: mw = %s' % (n,kstr)]
            outarr = outarr + outarr1 + [';'] + outarr2 + [';']
            for out in outarr:
                f.write(out + '\n')
    print(n,"records written to",fileout)
Beispiel #37
0
def preparation(inputfile,translit='deva'):
    """Read the words of a utf-8 file and return them as SLP1 letter strings.

    The file content is split on whitespace, passed through ``triming``,
    transcoded from Devanagari to SLP1 and stripped of every character
    outside ``A-Za-z``.  A word that becomes empty through that stripping
    is dropped.

    NOTE(review): ``translit`` is accepted but never used; the input is
    always treated as Devanagari.  Confirm with callers before wiring it in.
    """
    # Close the input file deterministically, also when reading fails.
    with codecs.open(inputfile,'r','utf-8') as infile:
        inputwords = infile.read().split()
    output = []
    for word in triming(inputwords):
        word = transcoder.transcoder_processString(word,'deva','slp1')
        letters = re.sub('[^A-Za-z]','',word)
        # A word that was already purely alphabetic is kept unchanged; one
        # that lost characters is kept only when something remains.
        if letters or letters == word:
            output.append(letters)
    return output
Beispiel #38
0
def getSKngrams(nth):
    result = set()
    fin = codecs.open('../../../siddhantakaumudi/sk1.txt', 'r', 'utf-8')
    for text in fin:
        text = re.sub(u'^[{][#]उ[0-9]+[#][}]', '', text)
        text = text.replace(u'(अ)', '')
        text = text.replace(u'(स्व)', '')
        text = transcoder.transcoder_processString(text, 'deva', 'slp1')
        text = re.sub(u'[^a-zA-Z \']+', ' ', text)
        text = re.sub('[ ]+', ' ', text)
        result = result.union(getngrams(text.encode('utf-8'), nth))
    fin.close()
    print len(result), nth, 'gram'
    return result
def getSKngrams(nth):
	result = set()
	fin = codecs.open('../../../siddhantakaumudi/sk1.txt','r','utf-8')
	for text in fin:
		text = re.sub(u'^[{][#]उ[0-9]+[#][}]','',text)
		text = text.replace(u'(अ)','')
		text = text.replace(u'(स्व)','')
		text = transcoder.transcoder_processString(text,'deva','slp1')
		text = re.sub(u'[^a-zA-Z \']+',' ',text)
		text = re.sub('[ ]+',' ',text)
		result = result.union(getngrams(text.encode('utf-8'),nth))
	fin.close()
	print len(result), nth, 'gram'
	return result	
Beispiel #40
0
def preparation(inputfile,translit='deva'):
    """Return the words of a utf-8 file as SLP1 strings made of letters only.

    Words are obtained by whitespace-splitting the file, filtered through
    ``triming``, transcoded from Devanagari to SLP1, and cleaned of every
    character outside ``A-Za-z``.  Words left empty by the cleaning are
    discarded.

    NOTE(review): the ``translit`` argument is not used; the source scheme
    is hard-coded to 'deva'.  Verify against callers before changing that.
    """
    # Read everything up front so the handle is released immediately.
    with codecs.open(inputfile,'r','utf-8') as infile:
        inputwords = infile.read().split()
    output = []
    for word in triming(inputwords):
        word = transcoder.transcoder_processString(word,'deva','slp1')
        letters = re.sub('[^A-Za-z]','',word)
        # Unchanged words are always kept; words that lost characters are
        # kept only if they are still non-empty.
        if letters or letters == word:
            output.append(letters)
    return output
Beispiel #41
0
def unused_as2slp1_systematic(x):
    """Normalise an 'as'-coded string with fixed rewrites, then transcode to SLP1.

    The rewrites are applied in order: hyphens are removed, a nasal before
    certain consonants becomes ``m3``, ``ss`` becomes ``h2s``, and a final
    ``va(n`` / ``ma(n`` becomes ``vat`` / ``mat``.
    """
    rewrites = (
        (r'-', ''),
        # nasals
        (r'n3([kg])', r'm3\1'),
        (r'n5([cj])', r'm3\1'),
        (r'm([pbm])', r'm3\1'),
        (r'n([tdn])', r'm3\1'),
        # visarga
        (r'ss', 'h2s'),
        # alternate vant/vat  or mant/mat
        (r'va\(n$', 'vat'),
        (r'ma\(n$', 'mat'),
    )
    normalised = x
    for pattern, replacement in rewrites:
        normalised = re.sub(pattern, replacement, normalised)
    return transcoder.transcoder_processString(normalised,'as','slp1')
Beispiel #42
0
def adjust_slp1(x):
    """Transcode one line from SLP1 to WX while leaving markup untouched.

    A line that consists of a single ``<...>`` or ``</...>`` tag has the tag
    content itself transcoded.  Any other line is split into xml tags,
    ``[Page...]`` markers and plain text; only the plain text is transcoded.
    """
    whole_tag = re.search(r'^<(/?)(.*?)>$',x)
    if whole_tag:
        slash, content = whole_tag.group(1), whole_tag.group(2)
        return "<%s%s>" % (slash, transcoder.transcoder_processString(content,'slp1','wx'))
    pieces = []  # wx
    for part in re.split(r'(<[^>]+>)|(\[Page.*?\])',x):
        # re.split yields None for the capture group that did not take part
        # in a match, and '' between adjacent separators; skip both.
        if not part:
            continue
        is_tag = part.startswith('<') and part.endswith('>')
        is_page = part.startswith('[Page') and part.endswith(']')
        if is_tag or is_page:
            pieces.append(part)
        else:
            # plain text, assumed to be slp1
            pieces.append(transcoder.transcoder_processString(part,'slp1','wx'))
    return ''.join(pieces)
Beispiel #43
0
def convertfromfile(inputfile,outputfile):
	f = codecs.open(inputfile, 'r', 'utf-8') # Opened inputfile with UTF-8 encoding.
	data = f.readlines() # Read the lines into a list.
	f.close() # Closed the inputfile.
	g = codecs.open(outputfile, 'w', 'utf-8') # Opened the outputfile with UTF-8 encoding.
	for datum1 in data: # For each member of data,
		datum1 = datum1.strip() # Removed unnecessary whitespaces.
		datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1") # Converted from Devanagari to SLP1.
		dat = re.split('(\W+)',datum1) # Created a word list by exploding the sentence at word boundaries.
		for i in xrange(len(dat)):
			datum = dat[i].strip() # Clean whitespaces.
			if i % 2 == 0 and i != len(dat)-1: # Even members of datum are the words and odd members are word boundaries. Therefore, processing only even members. 
				#print "analysis of word started", timestamp()
				x = devanagaridisplay(datum) # Analysed the even members.
				#print "analysis of word ended", timestamp()
				g.write(transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")") # Wrote to the outputfile.
				print datum, timestamp()
				#print transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")" # printed to the screen for the user.
				#print "wrote to the file", timestamp()
			else:
				g.write(transcoder.transcoder_processString(dat[i], "slp1", "deva")) # For odd members, converted the word boundaries to their Devanagari counterparts.
		g.write('\n') # Newline character added
		print # Newline character printed on terminal.
	g.close() # Closed outputfile.
Beispiel #44
0
def unused_transcode_line(x,tranin,tranout):
    """Transcode the text of a line from ``tranin`` to ``tranout``.

    A line that is exactly one ``[Page...]`` marker is returned unchanged.
    Otherwise the line is split at ``<...>`` tags; the tags are kept as they
    are and everything between them is transcoded.
    """
    if re.search(r'^\[Page.*?\]$',x):
        return x
    return ''.join(
        part if part.startswith('<')
        else transcoder.transcoder_processString(part,tranin,tranout)
        for part in re.split(r'(<[^>]*>)',x)
    )
Beispiel #45
0
 def toString(self):
     """Return this link as one tab-separated line.

     Fields, in order: the author record's ``cologneid``, the raw link
     key, the link key transcoded 'asls' -> 'iast' -> 'iast1', the link
     key transcoded 'as1' -> 'roman', the author abbreviation, the author
     record's own ``toString()`` and ``self.print_type()``.

     If the ``cologneid`` cannot be read, the offending line is printed
     and the program exits with status 1.
     """
     try:
         cologneid = self.authrec.cologneid
     except Exception:
         # 'except Exception' rather than a bare 'except' so that
         # KeyboardInterrupt / SystemExit are not intercepted here.
         print "Link.toString error:", self.line.encode('utf-8')
         exit(1)
     authkey1 = self.authrec.authabbrev()
     linkkey1 = transcoder.transcoder_processString(self.linkkey, 'as1',
                                                    'roman')
     # transcode the same way it is done for ls in
     # correctionwork/cologne-issue-216
     linkkey2a = transcoder.transcoder_processString(
         self.linkkey, 'asls', 'iast')
     linkkey2 = transcoder.transcoder_processString(linkkey2a, 'iast',
                                                    'iast1')
     outarr = [
         cologneid,
         self.linkkey,
         linkkey2,
         linkkey1,
         authkey1,
         self.authrec.toString(),
         self.print_type(),
     ]
     return '\t'.join(outarr)
def main(inlines,hwrecs,fileout,fileout1):
 fout=codecs.open(fileout,"w","utf-8")
 fout1=codecs.open(fileout1,"w","utf-8")
 nsystematic=0
 nout=0
 for hwrec in hwrecs:
  datalines = inlines[hwrec.linenum1-1:hwrec.linenum2]
  # is it a foreign word? If so, get list of languages.
  fw = foreignword(datalines) 
  if len(fw) == 0:
   continue
  firstline = datalines[0] 
  page0 = hwrec.pagecol
  l1 = hwrec.linenum1
  l2 = hwrec.linenum2
  hw0 = hwrec.hwslp
  nout = nout + 1
  dictcode='ieg'
  # output to fileout
  out = "%s:%s:foreign %s" %(dictcode,hw0,','.join(fw))
  fout.write("%s\n" % out)
  # output to fileout1
  outarr=[]
  baseurl='http://www.sanskrit-lexicon.uni-koeln.de/scans/awork/apidev/servepdf.php?dict=%s'% dictcode
  url = '%s&page=%s' %(baseurl,page0)
  pageref = "[[%s][page %s]]" %(url,page0)
  outarr.append('* TODO Case %04d: %s %s' % (nout, hw0,pageref))
   # output up to 10 lines of datalines
  outlines = datalines[0:10]
  for x in outlines:
   y = transcoder.transcoder_processString(x,'as','roman')
   outarr.append(';  %s' % y)
  if len(datalines)>10:
   ndiff = len(datalines) - 10
   outarr.append(';   [and %s more lines]' % ndiff)
  # 1 extra blank line
  outarr.append('')
  fout1.write('\n'.join(outarr) + "\n")
  if (nout == 25) and False:
   print "debug",nout
   break
   pass
 fout.close()
 fout1.close()
 print len(hwrecs),"headword records processed"
 print nout,"records written to ",fileout
 print nout,"sections written to ",fileout1
Beispiel #47
0
def adjust_slp1(x):
    """Transcode the plain text of a line from SLP1 to HK.

    The line is split into xml tags, ``[Page...]`` markers and plain text.
    Tags and page markers are copied unchanged; everything else is assumed
    to be SLP1 and is transcoded to HK.
    """
    def convert_piece(part):
        # markup passes through verbatim, text is transcoded
        if part.startswith('<') and part.endswith('>'):
            return part
        if part.startswith('[Page') and part.endswith(']'):
            return part
        return transcoder.transcoder_processString(part,'slp1','hk')

    # re.split yields None for the capture group that did not participate
    # in a match, and '' between adjacent separators; both are dropped.
    pieces = re.split(r'(<[^>]+>)|(\[Page.*?\])',x)
    return ''.join(convert_piece(part) for part in pieces if part)
def gettestngrams(forThisBook,nth):
	result = set()
	padalist=['pada-1.1','pada-1.2','pada-1.3','pada-1.4','pada-2.1','pada-2.2','pada-2.3','pada-2.4','pada-3.1','pada-3.2','pada-3.3','pada-3.4','pada-4.1','pada-4.2','pada-4.3','pada-4.4','pada-5.1','pada-5.2','pada-5.3','pada-5.4','pada-6.1','pada-6.2','pada-6.3','pada-6.4','pada-7.1','pada-7.2','pada-7.3','pada-7.4','pada-8.1','pada-8.2','pada-8.3','pada-8.4']
	for pada in padalist:
		inputdir = '../../'+forThisBook+'/'+pada
		inputfiles = glob.glob(inputdir+'/*.*')
		print inputdir
		for inputfile in inputfiles:
			fin = codecs.open(inputfile,'r','utf-8')
			data = fin.read()
			text = data.split('---')[2].strip()
			text = transcoder.transcoder_processString(text,'deva','slp1')
			text = re.sub('[^a-zA-Z \']+','',text)
			result = result.union(getngrams(text.encode('utf-8'),nth))
			fin.close()
		print len(result), nth, 'gram'
	return result	
Beispiel #49
0
def adjust_hk_slp1(m):
    """Rebuild a three-group match with its middle group transcoded HK -> SLP1.

    ``m`` is a match object with three groups.  Groups 1 and 3 are copied
    unchanged.  Group 2 is split at ``[Page...]`` markers and single
    periods; those separators are emitted wrapped as ``#}...{#`` and the
    text between them is transcoded from 'hk' to 'slp1'.
    Presumably used as the replacement callback of ``re.sub`` - verify
    against the caller.
    """
    prefix, body, suffix = m.group(1), m.group(2), m.group(3)
    pieces = [prefix]
    # Nov 5 - 2nd pass: split only on page markers and single periods.
    for part in re.split(r'(\[Page.*?\]|[.])',body):
        is_separator = re.search(r'^(\[Page.*?\]|[.])$',part)
        if is_separator:
            pieces.append('#}%s{#' % part)
        else:
            pieces.append(transcoder.transcoder_processString(part,'hk','slp1'))
    pieces.append(suffix)
    return ''.join(pieces)
def add_tags1(x):
    """Format one ``{#number#}sutra{@number@}`` line as a tagged entry.

    The line is matched against three groups: 1 = SK number, 2 = sUtra,
    3 = AS number.  The sUtra is stripped; the AS number is stripped and
    transcoded from 'slp1' to 'deva'.  The result has the layout

        <num>|<sutra>|<sutra> <num>|<num> <sutra>
        <sutra> <num> <BR>

    preceded by two newlines, for example:

        १.१.६९|अणुदित्सवर्णस्य चाप्रत्ययः|अणुदित्सवर्णस्य चाप्रत्ययः १.१.६९|१.१.६९ अणुदित्सवर्णस्य चाप्रत्ययः
        अणुदित्सवर्णस्य चाप्रत्ययः १.१.६९ <BR>
    """
    found = re.search(u'{#([फि।उ]*[0-9]+)#}(.*){@([0-9-]+)@}', x)
    # sUtra (in Devanagari)
    sutra = found.group(2).strip()
    # Number (in Devanagari)
    num = transcoder.transcoder_processString(
        found.group(3).strip(), 'slp1', 'deva')
    sutra_num = sutra + ' ' + num
    num_sutra = num + ' ' + sutra
    first_line = '|'.join([num, sutra, sutra_num, num_sutra])
    result = '\n\n' + first_line + '\n' + sutra_num + ' <BR> '
    # Change dash to period.
    result = result.replace('-', '.')
    # Remove unnecessary two line breaks before the first entry.
    return result.replace(u'\n\n०.०.०', u'०.०.०')
Beispiel #51
0
def disp_org(icase,wordtype,hw0,url,page0,datalines):
    """Return the lines of one Emacs org-mode section describing a case.

    The first line is a heading with the zero-padded case number, the word
    type, the headword and an org link to ``url`` labelled with ``page0``.
    It is followed by at most 10 of ``datalines`` (with '|' removed and
    transcoded from 'as' to 'roman'), a note on how many lines were left
    out when there are more than 10, and a final empty string.
    """
    shown = 10
    pageref = "[[%s][page %s]]" %(url,page0)
    section = ['* Case %04d: %s %s %s ' % (icase, wordtype,hw0,pageref)]
    for raw in datalines[0:shown]:
        # Remove '|', which is a line-separator in BUR
        text = raw.replace('|','')
        section.append(';  %s' % transcoder.transcoder_processString(text,'as','roman'))
    omitted = len(datalines) - shown
    if omitted > 0:
        section.append(';   [and %s more lines]' % omitted)
    section.append('')
    return section
Beispiel #52
0
def convert(filein,fileout,tranin,tranout):
 fp = codecs.open(filein,"r",'utf-8')
 fpout = codecs.open(fileout,"w",'utf-8')
 n=0;
 for b in fp:
  exp = b.split("@")
  x = exp[0]
  exp[4] = exp[4].strip()
  x = x.rstrip('\r\n')
  y = x.lower()
  y = y[0].upper()+y[1:]
  if (y == ''):
   continue
  n=n+1
  z = transcoder.transcoder_processString(y,tranin,tranout)
  fpout.write("%s@%s@%s@%s@%s@%s\n" % (z,exp[0],exp[1],exp[2],exp[3],exp[4]))
 fp.close()
 fpout.close()
 print n,"lines converted to IAST and stored in abbrvoutput/sortedcrefsiast.txt\n"
Beispiel #53
0
def convert(filein, fileout, tranin, tranout):
    fp = codecs.open(filein, "r", 'utf-8')
    fpout = codecs.open(fileout, "w", 'utf-8')
    n = 0
    for b in fp:
        exp = b.split("@")
        x = exp[0]
        exp[4] = exp[4].strip()
        x = x.rstrip('\r\n')
        y = x.lower()
        y = y[0].upper() + y[1:]
        if (y == ''):
            continue
        n = n + 1
        z = transcoder.transcoder_processString(y, tranin, tranout)
        fpout.write("%s@%s@%s@%s@%s@%s\n" %
                    (z, exp[0], exp[1], exp[2], exp[3], exp[4]))
    fp.close()
    fpout.close()
    print n, "lines converted to IAST and stored in abbrvoutput/sortedcrefsiast.txt\n"
Beispiel #54
0
def adjust_hk_slp1(m):
    """Rebuild a three-group match with its middle group transcoded HK -> SLP1.

    ``m`` is a match object with three groups.  Groups 1 and 3 are copied
    unchanged.  Group 2 is split at ``ƒPage...ƒ`` markers and single
    periods; each such separator is emitted wrapped as ``}...#{`` and the
    remaining text is transcoded from 'hk' to 'slp1'.
    """
    prefix, body, suffix = m.group(1), m.group(2), m.group(3)
    pieces = [prefix]
    for part in re.split(u'(ƒPage.*?ƒ)|([.])',body):
        # re.split yields None for the capture group that did not take
        # part in a match, and '' between adjacent separators.
        if not part:
            continue
        is_separator = part.startswith(u'ƒ') or re.search(r'^([.])$',part)
        if is_separator:
            pieces.append('}%s#{' % part)
        else:
            pieces.append(transcoder.transcoder_processString(part,'hk','slp1'))
    pieces.append(suffix)
    return ''.join(pieces)
Beispiel #55
0
def postprocess(line):
    """Convert one line of scraped HTML into plain text with front matter.

    * ``&quot;`` becomes a backtick.
    * A ``<div>sutra <span class="sUtramIndex">, index</span> </div><p>``
      header becomes a ``---`` delimited block holding the index
      (transcoded from Devanagari to SLP1), the sutra text and
      ``vritti:  nyasa``.
    * A linked sUtramIndex span becomes ``(<number>)``, the number being
      taken from the link target.
    * All remaining tags are removed; the result is stripped and ends with
      exactly one newline.
    """
    header = r'<div>([^<]*) <span class="sUtramIndex">, ([^<]*)</span> </div><p>'

    def front_matter(m):
        # Built by a callable so that the captured text is inserted
        # literally (a replacement *string* would have its backslashes
        # interpreted) and each header gets its own index and sutra.
        index = transcoder.transcoder_processString(m.group(2), 'deva', 'slp1')
        return ('---\nindex:  ' + index + '\nsutra:  ' + m.group(1)
                + '\nvritti:  nyasa\n---\n\n')

    x = line.replace('&quot;', '`')
    x = re.sub(header, front_matter, x)
    x = re.sub(
        r'<span class="sUtramIndex"><a href="([0-9.]+)[.]htm">([^<]*)</a></span>',
        r'(\g<1>)', x)
    x = x.replace('<span class="prashna">', '')
    x = x.replace('<span class="vArtikA">', '')
    x = re.sub('[<][^>]*[>]', '', x)
    return x.strip() + '\n'
Beispiel #56
0
def convert(filein, fileout, tranin, tranout):
    fp = codecs.open(filein, 'r', 'utf-8')
    fpout = codecs.open(fileout, 'w', 'utf-8')
    n = 0
    for x in fp:
        x = x.rstrip('\r\n')
        if (x == ''):
            continue
        n = n + 1
        m = re.search(r'^([^ ]+) (.+)$', x)
        if not m:
            out = 'line %s is unknown: %s' % (n, x)
            exit(1)
        head = m.group(1)
        body = m.group(2)
        body1 = transcoder.transcoder_processString(body, tranin, tranout)
        y = '%s %s' % (head, body1)
        fpout.write('%s\n' % y)
    fp.close()
    fpout.close()
    print n, 'lines converted\n'
Beispiel #57
0
def convert(filein, fileout, tranin, tranout):
    fp = codecs.open(filein, "r", 'utf-8')
    fpout = codecs.open(fileout, "w", 'utf-8')
    n = 0
    for x in fp:
        x = x.rstrip('\r\n')
        if (x == ''):
            continue
        n = n + 1
        m = re.search(r'^([^ ]+) (.+)$', x)
        if not m:
            out = "line %s is unknown: %s" % (n, x)
            exit(1)
        head = m.group(1)
        body = m.group(2)
        #body = re.sub('/\|/',' # ',body);
        #body = preg_replace('/ +/',' ',body);
        body1 = transcoder.transcoder_processString(body, tranin, tranout)
        y = "%s %s" % (head, body1)
        fpout.write("%s\n" % y)
    fp.close()
    fpout.close()
    print n, "lines converted\n"