def LevenshteinDistance4Ku2SoDic(dicFile, output):
    """Write the Kurmanji-to-Sorani dictionary sorted by edit distance.

    Reads the Kurmanji dictionary in ``dicFile`` via
    ``getVocabList.getKurmanjiDic``, computes the Levenshtein distance
    between every Kurmanji/Sorani pair, and writes the pairs to
    ``output`` (UTF-8) sorted by that distance, one numbered entry per
    line.

    :param dicFile: path of the Kurmanji-to-Sorani dictionary to read.
    :param output: path of the UTF-8 file to (over)write.
    """
    import codecs
    import getVocabList

    currentVocabList = getVocabList.getKurmanjiDic(dicFile)
    outputlist = []
    for node in currentVocabList:
        # Each dictionary line is "<itemNo> <Kurmanji> <Sorani>".
        itemNo, Kurmanji, Sorani = node.split()
        outputlist.append(u'{: <6}{: <32}\t{: <4}'.format(
            levenshtein(Kurmanji, Sorani), Kurmanji, Sorani))
    # The distance is the leading field of each entry, so a plain sort
    # orders the dictionary by edit distance.
    outputlist.sort()
    outfile = codecs.open(output, 'w', encoding='utf-8')
    try:
        for i, item in enumerate(outputlist, 1):
            outfile.write(u'{: <6}\t{: <32}\n'.format(i, item))
    finally:
        # Close the file even if a write fails (the original leaked the
        # handle on error).
        outfile.close()
def mergeKu2SoDic(SDic, KDic):
    """Merge Kurmanji-to-Sorani entries into the Sorani-to-Kurmanji dic.

    Reads the current Sorani-to-Kurmanji dictionary from ``SDic`` and the
    Kurmanji-to-Sorani dictionary from ``KDic``, adds every Kurmanji
    entry whose Sorani headword is not already present, then rewrites
    ``SDic`` (UTF-8) sorted and renumbered.  Fails silently if the file
    cannot be rewritten.

    :param SDic: path of the Sorani-to-Kurmanji dictionary (rewritten).
    :param KDic: path of the Kurmanji-to-Sorani dictionary (read only).
    """
    import codecs
    import getVocabList

    updatedVocabList = []
    seen = set()  # Sorani headwords already present; used to skip duplicates
    # Format the current dic so it is ready for merging with the new vocabs.
    for node in getVocabList.getSoraniDic(SDic):
        itemNo, Sorani, Kurmanji = node.split()
        seen.add(Sorani)  # there shouldn't be duplicates in the current dic
        updatedVocabList.append(u'{: <25}\t{: <4}'.format(Sorani, Kurmanji))
    # Merge the vocabs from the Kurmanji-to-Sorani dic which do not exist
    # in the current Sorani-to-Kurmanji dictionary.
    for node in getVocabList.getKurmanjiDic(KDic):
        itemNo, Kurmanji, Sorani = node.split()
        if Sorani not in seen:
            seen.add(Sorani)
            updatedVocabList.append(
                u'{: <25}\t{: <4}'.format(Sorani, Kurmanji))
    updatedVocabList.sort()
    try:
        outfile = codecs.open(SDic, 'w', encoding='utf-8')
        try:
            for i, item in enumerate(updatedVocabList, 1):
                outfile.write(u'{: <6}\t{: <25}\n'.format(i, item))
        finally:
            outfile.close()
    except (IOError, OSError):
        # Leave with no harm if the dictionary cannot be rewritten.
        # (Narrowed from a bare "except:" that also hid programming errors.)
        return
def updateKu2SoDic(dicFile, newVocabList):
    """Add new Kurmanji/Sorani pairs to the Kurmanji-to-Sorani dictionary.

    Reads the current dictionary from ``dicFile``, appends every entry
    of ``newVocabList`` whose (lower-cased) Kurmanji headword is not yet
    present, then rewrites ``dicFile`` (UTF-8) sorted and renumbered.
    Fails silently if the file cannot be rewritten.

    :param dicFile: path of the Kurmanji-to-Sorani dictionary (rewritten).
    :param newVocabList: iterable of "Kurmanji Sorani" strings to merge in.
    """
    import codecs
    import getVocabList

    updatedVocabList = []
    seen = set()  # lower-cased Kurmanji headwords, used to skip duplicates
    # Format the current dic so it is ready for merging with the new vocabs.
    for node in getVocabList.getKurmanjiDic(dicFile):
        itemNo, Kurmanji, Sorani = node.split()
        updatedVocabList.append(u'{: <25}\t{: <4}'.format(Kurmanji, Sorani))
        seen.add(Kurmanji.lower())
    # Merge the new vocabs with the existing dictionary.
    for node in newVocabList:
        Kurmanji, Sorani = node.split()
        if Kurmanji.lower() not in seen:
            # Bug fix: record the new headword too, so duplicates inside
            # newVocabList itself are only added once (the original
            # appended every repeat).
            seen.add(Kurmanji.lower())
            updatedVocabList.append(
                u'{: <25}\t{: <4}'.format(Kurmanji, Sorani))
    updatedVocabList.sort()
    try:
        outfile = codecs.open(dicFile, 'w', encoding='utf-8')
        try:
            for i, item in enumerate(updatedVocabList, 1):
                outfile.write(u'{: <6}\t{: <25}\n'.format(i, item))
        finally:
            outfile.close()
    except (IOError, OSError):
        # Leave with no harm if the dictionary cannot be rewritten.
        # (Narrowed from a bare "except:" that also hid programming errors.)
        return
def Kurmanji2SoraniLiterTran(inputFile, outputFile):
    """Literal word-by-word Kurmanji-to-Sorani translation.

    Reads ``inputFile`` (UTF-8), strips punctuation/special characters,
    and replaces every word that has an entry in the Kurmanji-to-Sorani
    dictionary with its Sorani equivalent.  Unknown words are copied
    through unchanged; unknown alphabetic words longer than one
    character are collected (lower-cased, de-duplicated) as candidate
    new vocabulary.  The translated text is also written to
    ``outputFile`` (UTF-8).

    :param inputFile: path of the UTF-8 Kurmanji text to translate.
    :param outputFile: path of the UTF-8 file to write the translation to.
    :returns: tuple ``(translatedText, uniqueNewVocab, numberOfWords)``
        where ``translatedText`` is the list of emitted tokens (each
        followed by a space), ``uniqueNewVocab`` the new-vocabulary
        candidates, and ``numberOfWords`` the token count of the
        sanitized input.
    """
    import codecs
    import re
    import getVocabList

    infile = codecs.open(inputFile, 'r', encoding='utf-8')
    try:
        inputText = infile.read()
    finally:
        infile.close()

    # Remove special and non-alpha chars, then collapse excessive spaces.
    # Non-raw literal replaces the Python-2-only ur'' prefix: \' , \r and
    # \n resolve to the very characters the original raw class matched.
    tregex = re.compile(u'[$""!£"%$&:`)(.,?/\'\r\n]', re.IGNORECASE)
    sanitizedText = re.sub(' +', ' ', tregex.sub(' ', inputText))
    words = sanitizedText.split(' ')
    numberOfWords = len(words)

    # Build a one-shot lookup table instead of rescanning the whole
    # dictionary for every word (was O(words * dictionary-size)).
    # First entry wins, matching the original loop's early break.
    targetDic = getVocabList.getKurmanjiDic(
        '/home/hosseinhassani/Hossein/w4u/KurmanjiSoraniDicNew')
    lookup = {}
    for node in targetDic:
        itemNo, Kurmanji, Sorani = node.split()
        if Kurmanji not in lookup:
            lookup[Kurmanji] = Sorani

    translatedText = []
    newVocab = []  # vocabulary not found in the target dictionary
    for w in words:
        Sorani = lookup.get(w.lower())
        if Sorani is not None:
            # The equivalent for some Kurmanji words has more than one
            # token; the tokens are separated in the dictionary with
            # dashes ('-').
            translatedText.append(Sorani + ' ')
        else:
            # Unknown word: pass it through unchanged.  (Also fixes a
            # latent bug where a stale flag silently dropped words when
            # the dictionary was empty.)
            translatedText.append(w + ' ')
            # Only interested in alphabetic tokens with at least 2 chars.
            if w.isalpha() and len(w) > 1:
                newVocab.append(w.lower())

    # Remove duplicates and just keep a list of unique new vocabs,
    # preserving first-seen order.
    seen = set()
    uniqueNewVocab = []
    for v in newVocab:
        if v not in seen:
            uniqueNewVocab.append(v)
            seen.add(v)

    outfile = codecs.open(outputFile, 'w', encoding='utf-8')
    try:
        for item in translatedText:
            outfile.write(item)
    finally:
        outfile.close()

    return translatedText, uniqueNewVocab, numberOfWords