# This Python file uses the following encoding: utf-8
# Author - Dr. Dhaval Patel - drdhaval2785@gmail.com - www.sanskritworld.in
# XML databases of word forms taken from Gerard Huet's sanskrit.inria.fr site. For a sample, please see SL_roots.xml.
# Date - 23 October 2015
# Version - 1.0.1
from lxml import etree
from io import StringIO, BytesIO
import re
import transcoder
import codecs
import datetime
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
# Function to return timestamp
def timestamp():
return datetime.datetime.now()
# Parsing the XMLs. We will use them as globals when need be.
print "Parsing of XMLs started at", timestamp()
roots = etree.parse(dir_path+'/SL_roots.xml') # parses the XML file.
nouns = etree.parse(dir_path+'/SL_nouns.xml')
adverbs = etree.parse(dir_path+'/SL_adverbs.xml')
final = etree.parse(dir_path+'/SL_final.xml')
parts = etree.parse(dir_path+'/SL_parts.xml')
pronouns = etree.parse(dir_path+'/SL_pronouns.xml')
upasargas = etree.parse(dir_path+'/SL_upasargas.xml')
# This filelist can include all or some files. By default it takes all of Gerard's XMLs into account.
# If you need only a specific database (roots, nouns etc.), keep it and remove the rest; that speeds up loading.
filelist = [roots, nouns, adverbs, final, parts, pronouns, upasargas]
#filelist = [parts]
# Precompute a set of all forms for quick membership lookups.
formlist = []
for l in filelist:
    formlist.extend([member.get('form') for member in l.xpath('/forms/f')])
formset = set(formlist)
print "{} forms cached for quick search".format(len(formset))
print "Parsing of XMLs completed at", timestamp()
# Function for form check - for splitting etc.
def quicksearch(form):
global formset
return form in formset
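# Cheap set lookup; 'Bavati' below is just an example form, actual hits depend on the loaded XMLs:
# print quicksearch('Bavati')   # True if the form occurs in any loaded XML, else False.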
#print "Will notify after every 100 words analysed."
# function to create the verbformlist from XML file SL_roots.xml. Output stored as verbformlist.txt.
# The database can be put to many uses, e.g. in https://github.com/drdhaval2785/SanskritVerb/ I use it to compare my output against Gerard's database and note the errors in my output for correction.
def verbformlist():
global roots # fetched parsed XML.
g = codecs.open('verbformlist.txt', 'w', 'utf-8') # Opened the file to store data.
# Typical line in XML file is in `<forms><f form="aMSaya"><v><cj><prim/></cj><sys><prs gn="11"><md><ip/></md><para/></prs></sys><np><sg/><snd/></np></v><s stem="aMSa"/></f></forms>` format.
# Therefore, the data which we want e.g. 'aMSaya' is in /forms/f-get('form') location. Fetching it below.
roo = roots.xpath('/forms/f') # Returns a list of all /forms/f in the database.
    verbformlist = [member.get('form') for member in roo]  # For each /forms/f in the database, we get its 'form' attribute, e.g. 'aMSaya'.
g.write(','.join(verbformlist)) # The list verbformlist is stored as comma separated data in the outputfile.
g.close() # Closed the file.
#verbformlist()
# function to create verblist from XML file SL_roots.xml.
def verblist():
global roots # Fetched parsed XML.
g = codecs.open('verblist.txt', 'w', 'utf-8') # Opened the file to store data.
# Typical line in XML file is in `<forms><f form="aMSaya"><v><cj><prim/></cj><sys><prs gn="11"><md><ip/></md><para/></prs></sys><np><sg/><snd/></np></v><s stem="aMSa"/></f></forms>` format.
# Therefore, the data which we want e.g. 'aMSa' is in /forms/f/s-get('stem') location. Fetching it below.
roo = roots.xpath('/forms/f/s') # Returns a list of all /forms/f/s in the database.
    verbformlist = [member.get('stem') for member in roo]  # For each /forms/f/s in the database, we get its 'stem' attribute, e.g. 'aMSa'.
    verbformlist = list(set(verbformlist))  # Keeping only unique entries, because one stem gives rise to many verb forms.
outputlist = [] # Initialising a list to store data for return.
for member in verbformlist:
        member = re.sub(r'[0-9]', r'', member)  # Removed all digits, because Gerard sometimes names his verbs like 'BU#1'. If you want such output, comment these lines out.
member = re.sub('#','',member) # Removed '#'.
outputlist.append(member) # Added in the outputlist.
outputlist = list(set(outputlist)) # Keeping only unique items.
print len(outputlist) # Just to get an idea of the number of verbs returned. Can comment out if not needed.
g.write(','.join(outputlist)) # The list is stored as comma separated data in the outputfile.
g.close() # Closed the file.
#verblist();
# Returns the first members of a compound. Gerard stores them as iic, iip and iiv tags in the SL_final.xml file.
def firstmemberlist():
global final # Calling global variable final
# defining xpaths
iic = final.xpath('/forms/f/iic')
iip = final.xpath('/forms/f/iip')
iiv = final.xpath('/forms/f/iiv')
# fetched and added data
firstmemberlist = [member.getparent().get('form') for member in iic]
firstmemberlist += [member.getparent().get('form') for member in iip]
firstmemberlist += [member.getparent().get('form') for member in iiv]
return firstmemberlist
# Storing firstmembers for future use as global variable.
firstmembers = firstmemberlist()
# Second members of a compound. Gerard has stored them in the nouns and participles files.
def secondmemberlist():
# Calling global variables nouns and parts
global nouns, parts
n = nouns.xpath('/forms/f')
p = parts.xpath('/forms/f')
# Storing data
secondmemberlist = [member.get('form') for member in n]
secondmemberlist += [member.get('form') for member in p]
return secondmemberlist
# Storing secondmembers for future use as global variable.
secondmembers = secondmemberlist()
# Returns all the forms for XML files listed in variable 'filelist'.
def allmemberlist():
global filelist # If you need to increase or decrease the database, change the filelist variable in the starting of the code.
allmemberlist = [] # Initialising a list to store data.
for file in filelist:
allwords = file.xpath('/forms/f')
allmemberlist += [member.get('form') for member in allwords] # Created an array of all forms in filelist.
return allmemberlist # Returns the list.
# Storing allmembers for future use as global variable.
allmembers = allmemberlist()
#print "Starting analysis at", timestamp()
# Function makestring converts a list of lists into all possible strings, e.g. [['p','b'],['A'],['g','u']] would give pAg,pAu,bAg,bAu as output.
# Default joiner is a blank. You can specify conj='+' etc. to join the strings with a '+'.
# The function is used as a subroutine in function 'sss'.
def makestring(listoflist, conj=''):
    # Seed the output with the alternatives of the first slot, e.g. ['p','b'].
    outputstore = listoflist[0]
    # For the rest of the slots, we run a for loop.
    for i in range(1, len(listoflist)):
        out2 = []
        for member in outputstore:
            # Every string of listoflist[i] is joined to member (the strings generated so far), e.g. pA,bA would become pAg,pAu,bAg,bAu in our example.
            for j in listoflist[i]:
                out2.append(member + conj + j)
        outputstore = out2
    return outputstore
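# Pure string manipulation, so this sample runs without any XML data:
# print makestring([['p', 'b'], ['A'], ['g', 'u']], conj='+')   # -> ['p+A+g', 'p+A+u', 'b+A+g', 'b+A+u']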
# This function splits a long word into its constituent words (If you want to change the dictionary, change the parameter 'words'. By default, they are fetched data from Gerard's XMLs above)
# http://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words
#def find_words(instring, prefix = '', words = firstmembers + secondmembers):
def find_words(instring, prefix = '', words = allmembers):
if not instring:
return [] # Return a blank list
if (not prefix) and (instring in words):
return [instring] # The instring is in our 'words' i.e. the input is in our dictionary. Therefore, there is no need to break it. It would be returned.
prefix, suffix = prefix + instring[0], instring[1:]
solutions = []
# Case 1: prefix in solution
if prefix in words:
try:
solutions.append([prefix] + find_words(suffix, '', words)) # A recursive function. This is very slow for long words. Will have to find out some more efficient way.
except ValueError:
pass
# Case 2: prefix not in solution
try:
solutions.append(find_words(suffix, prefix, words)) # A recursive function.
except ValueError:
pass
    if solutions:
        return sorted(solutions,
                      key=lambda solution: [len(word) for word in solution],
                      reverse=True)[0]  # Prefer the split made of the longest words.
    else:
        raise ValueError('no split found')  # No split possible; the recursive calls above and find_word_exact catch this.
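# Illustrative, data-dependent sketch ('rAma' and 'gacCati' are example forms, not guaranteed to be in the loaded XMLs):
# try:
#     print find_words('rAmagacCati')   # might give ['rAma', 'gacCati'] if both pieces are known forms.
# except ValueError:
#     print 'no split found'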
# A subroutine to the main function 'sss' to remove impossible splits, i.e. when any member of the split is too short, when the terminal member is 4 letters or fewer, or when two vowels are adjacent.
def leng(x, expectedlength):
    y = x.split('+')
    for mem in y:
        if len(mem) <= expectedlength:  # If any member of the split is not longer than the specified length, chances are low that the split is good, so such splits are rejected.
            return False
    if len(y[-1]) <= 4:  # The last member of the split is an inflected form, and inflected forms of 4 letters or fewer are very rare. Excluding them.
        return False
    if re.search(r'[aAiIuUfFeEoO][aAiIuUfFeEoO]', x):  # Sanskrit very rarely has two consecutive vowels. Excluding such splits.
        return False
    return True  # Everything else passes.
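# Both checks below follow directly from the rules above ('senA', 'gacCati' and 'ca' are example members):
# leng('senA+gacCati', 2)   # True: every member is longer than 2 letters, the last is longer than 4, no two vowels are adjacent.
# leng('senA+ca', 2)        # False: the member 'ca' is too short.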
# A subroutine to main function 'sss' to remove splits of find_words which don't parse the full word i.e. which didn't get the correct parse.
def find_word_exact(inputword):
    try:
        findwordoutput = find_words(inputword)
    except ValueError:
        return 'error'  # The word could not be split at all.
    if ''.join(findwordoutput) == inputword:  # If the split reassembles exactly into the inputword,
        return '+'.join(findwordoutput)  # return the split with '+' in between.
    else:
        return 'error'  # Return error.
# sss stands for sandhisamasasplitting. It works well, but is too slow. We need some restrictions while making sandhi splits.
def sss(inputword):
    # Some premodifications of the inputword, because Gerard stores his forms with terminal s or m.
    if len(inputword) > 1:
        if inputword[-1] == 'H':
            inputword = inputword[:-1] + "s"  # A word ending with a visarga is converted to sakArAnta.
        elif inputword[-1] == 'M':
            inputword = inputword[:-1] + "m"  # A word ending with an anusvAra is converted to makArAnta.
global firstmembers, secondmembers
    # Rough sandhi data for vowel sandhi. It can be extended to consonant sandhis, visarga sandhis, rutva sandhis etc. by the same logic. Pending as of now.
voweldata = [('A','a+a'),('A','a+A'),('A','A+a'),('A','A+A'),('I','i+i'),('I','i+I'),('I','I+i'),('I','I+I'),('U','u+u'),('U','u+U'),('U','U+u'),('U','U+U'),('F','f+f'),('e','e+a'),('e','a+i'),('e','A+i'),('e','a+I'),('e','A+I'),
('o','o+a'),('o','a+u'),('o','A+u'),('o','a+U'),('o','A+U'),('E','a+e'),('E','A+e'),('E','a+E'),('E','A+E'),('O','a+o'),('O','A+o'),('O','a+O'),('O','A+O'),('y','i+'),('y','I+'),('v','u+'),('v','U+'),]
    consonantdata = [(r'([aAiIuUfFxXeEoO])([M])([S])([c])', r'\1n\4')]  # The replacement must be a raw string so that \1 and \4 act as group backreferences.
out2 = []
    for i in xrange(len(inputword)):  # For each character of the inputword
out3 = [inputword[i]]
for (x,y) in voweldata: # For each tuple in list voweldata
out3.append(inputword[i].replace(x, y)) # Replaced the vowels with their possible splits e.g. A->a+a,a+A,A+a,A+A.
out3 = list(set(out3)) # Unique entries.
out2.append(out3)
    sandhisplitdata = makestring(out2)  # Combines the per-character alternatives into candidate split strings.
out4 = []
for san in sandhisplitdata:
if leng(san,2) or san.split('+')[-1] in ['ca']:
#print timestamp()
san = re.sub('[+]', '', san) # Removed '+' sign.
for (a,b) in consonantdata:
san = re.sub(a, b, san) # Replaced consonantdata with their corresponding changes.
#print timestamp()
            q = find_word_exact(san)  # Keep only those candidates which are exact splits of the inputword, filtering out the rest.
            #print timestamp()
            if q != 'error':
                out4.append(q)  # Store in out4 list.
    if len(out4) > 0:
        return '|'.join(out4)  # Return the splits separated by '|' in case there is more than one possibility.
    else:
        return 'error'  # Return error.
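# Illustrative, data-dependent sketch; the splits actually returned depend on the loaded XMLs:
# print sss('senAgacCati')   # might yield something like 'senA+gacCati', alternatives '|'-separated, or 'error'.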
# Function findwordform searches the XML files for entries matching the word form we are interested in, e.g. findwordform("Bavati") would find all <f> entries whose form is "Bavati".
def findwordform(inputform):
#print "importing filelist for findwordform started at", timestamp()
global filelist # Fetched global variable filelist.
#print "importing filelist for findwordform ended at", timestamp()
outputlist = [] # A list to store output.
for file in filelist:
tree = file # The etree to manipulate.
xpathname = '/forms/f[@form="' + inputform + '"]' # Defined the xpath to search
#print "xpath parsing for findwordform started at", timestamp() # After speed analysis, this function is the most time expensive. It takes around 0.10 sec per word form.
r = tree.xpath(xpathname) # Created a list 'r' whose members are lines of XML file which correspond to the word form 'inputform'
#print "xpath parsing for findwordform ended at", timestamp()
for member in r:
outputlist.append(etree.tostring(member).strip()) # Created a string out of element tree. strip() removes unnecessary white spaces around the string.
#print "findwordform completed at", timestamp()
if len(outputlist) == 0:
return '????' # Error message.
else:
return "|".join(outputlist) # Else, return the list separated by '|'.
# function converttodevanagari is used to translate the short forms used by Gerard Huet to their Sanskrit Devanagari counterpart.
def converttodevanagari(attributeslist):
# Abbreviations used by Gerard.
gerardwords = ['nom', 'acc', 'ins', 'dat', 'abl', 'gen', 'loc', 'voc', 'mas', 'fem', 'neu', 'dei', 'sg', 'du', 'pl', 'fst', 'snd', 'trd', 'iic', 'iiv', 'iip', 'avyaya', 'na', 'uf', 'conj']
# Their counterparts in Sanskrit
devawords = ['प्रथमा', 'द्वितीया', 'तृतीया', 'चतुर्थी', 'पञ्चमी', 'षष्ठी', 'सप्तमी', 'संबोधन', 'पुंल्लिङ्ग', 'स्त्रीलिङ्ग', 'नपुंसकलिङ्ग', 'सङ्ख्या', 'एकवचन', 'द्विवचन', 'बहुवचन', 'प्रथमपुरुष', 'मध्यमपुरुष', 'उत्तमपुरुष', 'समासपूर्वपद', 'सहायकधातुपूर्व', 'कृदन्तपूर्वपद', 'अव्यय', '', 'अव्यय', '']
outputlist = [] # initiated a blank list.
for member in attributeslist:
alist = [] # initiated a blank list.
for mem1 in member:
alist.append(devawords[gerardwords.index(mem1)]) # appended the Devanagari words for Gerard's abbreviations.
outputlist.append(alist) # appended alist to outputlist
return outputlist # returned outputlist which has everything converted to Sanskrit in Devanagari.
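# The mapping is fixed by the two lists above, e.g.:
# print converttodevanagari([['na', 'nom', 'sg']])   # -> [['', 'प्रथमा', 'एकवचन']] ('na' maps to a blank string).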
# Function iter extracts, from the XML data of a given word form, the base root and the grammatical attributes.
# Default strength is "Full". "deva" converts the output to Devanagari, which is not advisable for anything other than testing.
def iter(wordxml, strength="Full"):
if wordxml == "????":
return "????" # Error message
else:
wordxml = unicode(wordxml) # Converted the word to unicode
wordwithtags = [] # Empty list
individualentries = wordxml.split('|')
for individualentry in individualentries:
tree = StringIO(individualentry) # Created XML from the worddata
#print "parsing of iter started at", timestamp()
context = etree.parse(tree) # Parsed the element tree.
#print "parsing of iter ended at", timestamp()
root = context.getroot() # got the root of element tree e.g. 'f'
            # The next two steps require explanation. In Gerard's XML files, all possible attributes are given as children of 'f'. The last child is always 's', which stores the stem. All the other children are the various possible word attributes, given as 'na' or 'v' etc.
children = root.getchildren()[:-1] # attributes
basedata = root.getchildren()[-1] # 's' stem
basewordslp = basedata.get('stem').strip() # Base word in SLP1 encoding.
if strength == "deva":
baseword = transcoder.transcoder_processString(basewordslp,'slp1','deva') # If the user wants output in Devanagari rather than SLP1, this code converts it to Devanagari.
else:
baseword = basewordslp # Otherwise in SLP1.
attributes = [] # An empty list to store attributes.
for child in children:
taglist = child.xpath('.//*') # Fetches all elements (abbreviations) of a particular verb / word characteristics.
output = [child.tag] # The first member of output list is the tag of element 'v', 'na' etc.
output = output + [ tagitem.tag for tagitem in taglist] # Other tags (abbreviations) and add it to output list.
                # The following section is commented out right now. But it would be needed when we need to know the gaNa of a verb or the 7 kinds of aorist derivation.
"""if len(child.xpath('.//prs[@gn]')) > 0:
prsgana = child.xpath('.//prs')[0].get('gn')
output.append('verbgana')
output.append(prsgana)
elif len(child.xpath('.//aor[@gn]')) > 0:
aorgana = child.xpath('.//aor')[0].get('gn')
output.append('aoristgana')
output.append(aorgana)
elif len(child.xpath('.//inj[@gn]')) > 0:
injgana = child.xpath('.//inj')[0].get('gn')
output.append('injunctivegana')
output.append(injgana)"""
attributes.append(output) # output list is appended to attributes list.
if (strength == "deva"):
outputlist = converttodevanagari(attributes) # Devanagari
else:
outputlist = attributes # SLP1
for member in outputlist:
wordwithtags.append(baseword + "-" + "-".join(member) ) # Created a list wordwithtags where the first member is baseword and the rest of the members are attributes separated by '-'
#print "postprocessing of iter ended at", timestamp()
return "|".join(wordwithtags) # If there are more than one possible verb characteristics for a given form, they are shown separated by a '|'
def mapiter(input):
return iter(findwordform(input))
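# Illustrative, data-dependent sketch (the exact tag string depends on Gerard's data):
# print mapiter('Bavati')   # something like 'BU#1-v-cj-prim-sys-prs-md-pr-para-np-sg-trd', '|'-separated when ambiguous.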
# function analyser analyses all the XML files and gets matching details from all XMLs e.g. 'Bavati' may be a verb form, but it can also be a noun form of 'Bavat' locative singular. Therefore it is needed to traverse all XML files.
def analyser(word, strength="Full",split=True):
global secondmembers
    wordformdata = findwordform(word)  # Look the form up once; the xpath search is the most expensive step.
    if wordformdata != '????':  # If the output is not an error,
        return iter(wordformdata, strength)  # return the analysis.
elif split:
samasa = sss(word) # Try to split the word for samAsa / sandhi.
output = []
        if samasa != 'error':  # If the word can be split as a samAsa / sandhi,
indsamasa = samasa.split('|') # Separate all possible splits.
for indsam in indsamasa:
indsamcomponents = indsam.split('+') # Separate members of the compound / sandhi
analy = map(mapiter, indsamcomponents) # Apply mapiter function to all components of indsamcomponents.
prefix = indsamcomponents[:-1]
lastword = indsamcomponents[-1]
lastwordanalysed = iter(findwordform(lastword), strength)
for lastwordan in lastwordanalysed.split('|'):
output.append('+'.join(prefix) + '+' + lastwordan)
output.append('$'.join(analy)) # Appended the solution to output list.
return '%'.join(output) # Return the output joined by '%'.
else:
return '????' # Return error.
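# Illustrative, data-dependent sketches ('Bavati' and 'senAgacCati' are example inputs):
# print analyser('Bavati')       # analysed directly from the database.
# print analyser('senAgacCati')  # falls back to sss() splitting; alternative analyses are joined by '%'.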
# Functions findrootword and generator are for generating the word form from given attributes and root.
# They are slow, since finding a root means scanning every entry of every XML tree.
def findrootword(checkedrootword):
    listing = []  # Initialised a list.
    # Reuse the trees parsed at load time instead of re-parsing the XML files on every call; re-parsing made this function very slow.
    for tree in [roots, nouns, adverbs, final, parts, pronouns]:
        entries = tree.xpath('.//f')  # All entries of that tree.
        for entry in entries:
            kids = entry.getchildren()  # Got all the children of that entry.
            s = kids[-1]  # Fetched tag 's'.
            if s.get('stem') == checkedrootword:  # If the stem is the same as checkedrootword,
                children = kids[:-1]  # remove the last child, because it holds only the stem data.
for child in children:
taglist = child.xpath('.//*')
output = [ child.tag ] + [ tagitem.tag for tagitem in taglist]
if output[-1] in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] and output[-2] in ['verbgana', 'aoristgana', 'injunctivegana']: # Remove the last two data, because they are not part of Gerard's scheme.
output = output[:-2]
output += [ entry.get('form') ] # Added the entered word form at the last.
listing.append(output) # Added output to listing list
return listing # Return listing list.
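# Illustrative, data-dependent sketch:
# print findrootword('rAma')   # rows of tags whose last item is an inflected form, e.g. [..., 'rAmeRa'].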
def generator(analysedword, translit="slp1"):
analysedword = unicode(analysedword) # unicode
    data = analysedword.split('|')  # There may be cases where the data has been analysed by our analyser; the analyses are separated by '|'.
for datum in data:
separate = re.split('-', datum) # split the whole string by '-'
rootword = separate[0] # Base word
taglist = separate[1:] # attributes
if taglist[-1] in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] and taglist[-2] in ['verbgana', 'aoristgana', 'injunctivegana']:
taglist = taglist[:-2] # Removed artificially added attributes
datahavingroot = findrootword(rootword) # Created a list of possible items
outlist = []
for rootdatum in datahavingroot:
if set(taglist) < set(rootdatum): # If the tags supplied are a subset of the data from XML file,
outlist.append(rootdatum[-1]) # Add the word form to outlist
if translit == "deva":
return transcoder.transcoder_processString("|".join(outlist),'slp1','deva') # Devanagari
else:
return "|".join(outlist) # SLP1
#print generator('Davala-sg-mas-abl', 'deva')
# devangaridisplay and translator functions are created for Nripendra Pathak, so that he may provide necessary data for extending the code.
# function devanagaridisplay will show the attribute list from XML files in a format which a traditional Sanskrit scholar may understand easily.
def devanagaridisplay(word,split=True,remove_hash=False):
    if len(word) > 1:
        if word[-1] == 'H':
            word = word[:-1] + "s"  # A word ending with a visarga is converted to sakArAnta, because this is how Gerard has stored his data.
        elif word[-1] == 'M':
            word = word[:-1] + "m"  # Likewise, a word ending with an anusvAra becomes makArAnta.
# If there are tags which are not enumerated here, they can be added as and when there is a need.
database = [('v-cj-prim', 'प्राथमिकः'),
('v-cj-ca', 'णिजन्तः'),
('v-cj-int', 'यङन्तः'),
('v-cj-des', 'सन्नन्तः'),
('sys-prs-md-pr', 'लट्'),
('sys-prs-md-ip', 'लोट्'),
('sys-prs-md-op', 'विधिलिङ्'),
('sys-prs-md-im', 'लङ्'),
('sys-pas-md-pr', 'लट्-कर्मणि'),
('sys-pas-md-ip', 'लोट्-कर्मणि'),
('sys-pas-md-op', 'विधिलिङ्-कर्मणि'),
('sys-pas-md-im', 'लङ्-कर्मणि'),
('sys-tp-fut', 'लृट्'),
('sys-tp-prf', 'लिट्'),
('sys-tp-aor', 'लुङ्'),
('sys-tp-inj', 'आगमाभावयुक्तलुङ्'),
('sys-tp-cnd', 'लृङ्'),
('sys-tp-ben', 'आशीर्लिङ्'),
('sys-pef', 'लुट्'),
('np-sg', 'एकवचनम्'),
('np-du', 'द्विवचनम्'),
('np-pl', 'बहुवचनम्'),
('fst', 'उत्तमपुरुषः'),
('snd', 'मध्यमपुरुषः'),
('trd', 'प्रथमपुरुषः'),
('na-nom', 'प्रथमाविभक्तिः'),
('na-voc', 'संबोधनविभक्तिः'),
('na-acc', 'द्वितीयाविभक्तिः'),
('na-ins', 'तृतीयाविभक्तिः'),
('na-dat', 'चतुर्थीविभक्तिः'),
('na-abl', 'पञ्चमीविभक्तिः'),
('na-gen', 'षष्ठीविभक्तिः'),
('na-loc', 'सप्तमीविभक्तिः'),
('sg', 'एकवचनम्'),
('du', 'द्विवचनम्'),
('pl', 'बहुवचनम्'),
('mas', 'पुंल्लिङ्गम्'),
('fem', 'स्त्रीलिङ्गम्'),
('neu', 'नपुंसकलिङ्गम्'),
('dei', 'सङ्ख्या'),
('uf', 'अव्ययम्'),
('ind', 'क्रियाविशेषणम्'),
('interj', 'उद्गारः'),
('parti', 'निपातम्'),
('prep', 'चादिः'),
('conj', 'संयोजकः'),
('tasil', 'तसिल्'),
('vu-cj-prim', 'अव्ययधातुरूप-प्राथमिकः'),
('vu-cj-ca', 'अव्ययधातुरूप-णिजन्तः'),
('vu-cj-int', 'अव्ययधातुरूप-यङन्तः'),
('vu-cj-des', 'अव्ययधातुरूप-सन्नन्तः'),
('iv-inf','तुमुन्'),
('iv-abs','क्त्वा'),
('iv-per','per'),
('ab-cj-prim', 'क्त्वा-प्राथमिकः'),
('ab-cj-ca', 'क्त्वा-णिजन्तः'),
('ab-cj-int', 'क्त्वा-यङन्तः'),
('ab-cj-des', 'क्त्वा-सन्नन्तः'),
('kr-cj-prim-no', 'प्राथमिकः'),
('kr-cj-ca-no', 'णिजन्तः'),
('kr-cj-int-no', 'यङन्तः'),
('kr-cj-des-no', 'सन्नन्तः'),
('kr-vb-no', ''),
('ppp', 'कर्मणिभूतकृदन्तः'),
('ppa', 'कर्तरिभूतकृदन्तः'),
('pprp', 'कर्मणिवर्तमानकृदन्तः'),
('ppr-para', 'कर्तरिवर्तमानकृदन्त-परस्मैपदी'),
('ppr-atma', 'कर्तरिवर्तमानकृदन्त-आत्मनेपदी'),
('ppft-para', 'पूर्णभूतकृदन्त-परस्मैपदी'),
('ppft-atma', 'पूर्णभूतकृदन्त-आत्मनेपदी'),
('pfutp', 'कर्मणिभविष्यत्कृदन्तः'),
('pfut-para', 'कर्तरिभविष्यत्कृदन्त-परस्मैपदी'),
('pfut-atma', 'कर्तरिभविष्यत्कृदन्त-आत्मनेपदी'),
('gya', 'य'),
('iya', 'ईय'),
('tav', 'तव्य'),
('para', 'कर्तरि'),
('atma', 'कर्तरि'),
('pass', 'कर्मणि'),
('pa', 'कृदन्तः'),
('iic', 'समासपूर्वपदनामपदम्'),
('iip', 'समासपूर्वपदकृदन्तः'),
('iiv', 'समासपूर्वपदधातुः'),
('upsrg', 'उपसर्गः')
]
#print "analysis of word started", timestamp()
datafetched = analyser(word,split=split) # Analyse the input word.
#print "analysis of word ended", timestamp()
if datafetched == "????": # If error
return "????" # Return error
else:
individual = datafetched.split("|") # create a list of all possible splits.
outputlist = []
        for ind in individual:
            tags = ind.split('-')  # Separate the tags.
            root = tags[0].decode('utf-8')  # Base root.
            if remove_hash:
                root = root.split('#')[0]  # Removes the '#1' etc. homonym numbering kept by Gerard.
            root = transcoder.transcoder_processString(root, "slp1", "deva")  # Conversion to Devanagari.
            output = "-".join(tags[1:])  # All attributes joined with '-'.
            output = output.decode('utf-8')  # UTF-8
for member in database:
output = re.sub(member[0], member[1].decode('utf-8'), output) # Changed attributes strings with their Sanskrit Devanagari counterparts.
            output = transcoder.transcoder_processString(output, "slp1", "deva")  # Converts any remaining SLP1 fragments to Devanagari (reported to be problematic on some Windows setups).
outputlist.append(root + "-" + output)
#print "Postprocessing of word ended", timestamp()
return "|".join(outputlist) # Return output
#print devanagaridisplay("aMSakatas")
# This is the most important function.
# It analyses the words in inputfile and gives the analysed version in outputfile.
def convertfromfile(inputfile,outputfile):
f = codecs.open(inputfile, 'r', 'utf-8') # Opened inputfile with UTF-8 encoding.
data = f.readlines() # Read the lines into a list.
f.close() # Closed the inputfile.
g = codecs.open(outputfile, 'w', 'utf-8') # Opened the outputfile with UTF-8 encoding.
for datum1 in data: # For each member of data,
datum1 = datum1.strip() # Removed unnecessary whitespaces.
datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1") # Converted from Devanagari to SLP1.
        dat = re.split(r'(\W+)', datum1)  # Created a word list by exploding the sentence at word boundaries; the capturing group keeps the boundaries as odd members.
for i in xrange(len(dat)):
datum = dat[i].strip() # Clean whitespaces.
            if i % 2 == 0 and i != len(dat) - 1:  # Even members of dat are the words and odd members are word boundaries. Therefore, processing only even members.
#print "analysis of word started", timestamp()
x = devanagaridisplay(datum) # Analysed the even members.
#print "analysis of word ended", timestamp()
g.write(transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")") # Wrote to the outputfile.
print datum, timestamp()
#print transcoder.transcoder_processString(datum, "slp1", "deva")+"("+x+")" # printed to the screen for the user.
#print "wrote to the file", timestamp()
else:
g.write(transcoder.transcoder_processString(dat[i], "slp1", "deva")) # For odd members, converted the word boundaries to their Devanagari counterparts.
g.write('\n') # Newline character added
print # Newline character printed on terminal.
g.close() # Closed outputfile.
if __name__=="__main__":
convertfromfile('sanskritinput.txt','analysedoutput.txt')