コード例 #1
0
def LevenshteinDistance4Ku2SoDic(dicFile, output):
    """Write the Kurmanji-Sorani dictionary ranked by Levenshtein distance.

    Every entry returned by ``getVocabList.getKurmanjiDic(dicFile)`` is split
    on whitespace into an item number, a Kurmanji word and a Sorani word.
    The distance between the two words is computed with ``levenshtein`` and
    the entries are written to ``output`` (UTF-8), numbered from 1, in
    ascending order of distance.

    Args:
        dicFile: Path handed to ``getVocabList.getKurmanjiDic``.
        output: Path of the report file; it is overwritten.

    Raises:
        ValueError: If a dictionary entry does not consist of exactly three
            whitespace-separated tokens.
    """
    import codecs

    import getVocabList

    ranked = []
    for node in getVocabList.getKurmanjiDic(dicFile):
        itemNo, Kurmanji, Sorani = node.split()
        # Assumes ``levenshtein`` returns a number -- it is defined outside
        # this block.
        distance = levenshtein(Kurmanji, Sorani)
        line = u'{: <6}{: <32}\t{: <4}'.format(distance, Kurmanji, Sorani)
        # Keep the numeric distance next to the formatted line: sorting the
        # formatted text alone would order distances lexicographically
        # ("10" before "2").
        ranked.append((distance, line))

    # Ties on distance fall back to the formatted line.
    ranked.sort()

    # The file is opened only once all data is ready, so a failure above
    # does not leave a truncated report behind; ``with`` closes the handle
    # even if a write fails.
    with codecs.open(output, 'w', encoding='utf-8') as outfile:
        for i, (_, line) in enumerate(ranked, 1):
            outfile.write(u'{: <6}\t{: <32}\n'.format(i, line))
コード例 #2
0
def mergeKu2SoDic(SDic, KDic):
    """Merge Kurmanji-to-Sorani entries into the Sorani-to-Kurmanji dictionary.

    Entries of the current Sorani dictionary (``SDic``) are kept. Entries of
    the Kurmanji dictionary (``KDic``) whose Sorani word is not yet present
    are added with the columns swapped. The merged list is sorted,
    renumbered from 1 and written back to ``SDic`` (UTF-8).

    Writing is best-effort: if the file cannot be written, a warning is
    logged and the function returns without raising.

    Args:
        SDic: Path of the Sorani-to-Kurmanji dictionary; read through
            ``getVocabList.getSoraniDic`` and then overwritten.
        KDic: Path of the Kurmanji-to-Sorani dictionary; read through
            ``getVocabList.getKurmanjiDic``.

    Raises:
        ValueError: If a dictionary entry does not consist of exactly three
            whitespace-separated tokens.
    """
    import codecs
    import logging

    import getVocabList

    currentVocabList = getVocabList.getSoraniDic(SDic)
    Ku2SoDic = getVocabList.getKurmanjiDic(KDic)

    updatedVocabList = []
    seen = set()  # Sorani words already present in the merged list

    # Re-format the current dictionary so it can be merged with new entries.
    for node in currentVocabList:
        itemNo, Sorani, Kurmanji = node.split()
        seen.add(Sorani)
        updatedVocabList.append(u'{: <25}\t{: <4}'.format(Sorani, Kurmanji))

    # Add only those Kurmanji-to-Sorani entries whose Sorani word is new.
    for node in Ku2SoDic:
        itemNo, Kurmanji, Sorani = node.split()
        if Sorani not in seen:
            seen.add(Sorani)
            updatedVocabList.append(
                u'{: <25}\t{: <4}'.format(Sorani, Kurmanji))

    updatedVocabList.sort()

    try:
        # ``with`` guarantees the handle is closed even when a write fails.
        with codecs.open(SDic, 'w', encoding='utf-8') as outfile:
            for i, item in enumerate(updatedVocabList, 1):
                outfile.write(u'{: <6}\t{: <25}\n'.format(i, item))
    except (IOError, OSError, UnicodeError) as err:
        # Stay best-effort for write failures, but do not swallow unrelated
        # errors (e.g. KeyboardInterrupt) and do not fail silently.
        logging.getLogger(__name__).warning(
            'Could not write dictionary %s: %s', SDic, err)
        return
コード例 #3
0
def updateKu2SoDic(dicFile, newVocabList):
    """Add new vocabulary to the Kurmanji-to-Sorani dictionary file.

    The current dictionary is read through
    ``getVocabList.getKurmanjiDic(dicFile)``. Each element of
    ``newVocabList`` whose Kurmanji word is not yet present (compared
    case-insensitively) is added. The result is sorted, renumbered from 1
    and written back to ``dicFile`` (UTF-8).

    Writing is best-effort: if the file cannot be written, a warning is
    logged and the function returns without raising.

    Args:
        dicFile: Path of the dictionary file; read and then overwritten.
        newVocabList: Iterable of strings, each holding a Kurmanji word and
            its Sorani equivalent separated by whitespace.

    Raises:
        ValueError: If an existing entry does not have exactly three
            whitespace-separated tokens, or a new entry does not have
            exactly two.
    """
    import codecs
    import logging

    import getVocabList

    updatedVocabList = []
    seen = set()  # lower-cased Kurmanji words already in the dictionary

    # Re-format the current dictionary so it can be merged with new entries.
    for node in getVocabList.getKurmanjiDic(dicFile):
        itemNo, Kurmanji, Sorani = node.split()
        updatedVocabList.append(u'{: <25}\t{: <4}'.format(Kurmanji, Sorani))
        seen.add(Kurmanji.lower())

    # Merge the new vocabulary with the existing dictionary.
    for node in newVocabList:
        Kurmanji, Sorani = node.split()
        key = Kurmanji.lower()
        if key not in seen:
            # Record the word so that a repeat inside ``newVocabList``
            # itself is not added a second time.
            seen.add(key)
            updatedVocabList.append(
                u'{: <25}\t{: <4}'.format(Kurmanji, Sorani))

    updatedVocabList.sort()

    try:
        # ``with`` guarantees the handle is closed even when a write fails.
        with codecs.open(dicFile, 'w', encoding='utf-8') as outfile:
            for i, item in enumerate(updatedVocabList, 1):
                outfile.write(u'{: <6}\t{: <25}\n'.format(i, item))
    except (IOError, OSError, UnicodeError) as err:
        # Stay best-effort for write failures, but do not swallow unrelated
        # errors (e.g. KeyboardInterrupt) and do not fail silently.
        logging.getLogger(__name__).warning(
            'Could not write dictionary %s: %s', dicFile, err)
        return
コード例 #4
0
def Kurmanji2SoraniLiterTran(
        inputFile, outputFile,
        dicFile='/home/hosseinhassani/Hossein/w4u/KurmanjiSoraniDicNew'):
    """Translate a Kurmanji text word by word into Sorani.

    The UTF-8 text in ``inputFile`` is stripped of punctuation and line
    breaks, split on blanks, and every word is looked up (lower-cased) in
    the Kurmanji dictionary. Words with an entry are replaced by the Sorani
    equivalent; all other words are copied unchanged. The result is written
    to ``outputFile`` (UTF-8), each token followed by a single blank.

    Args:
        inputFile: Path of the UTF-8 text to translate.
        outputFile: Path of the translation; it is overwritten.
        dicFile: Path handed to ``getVocabList.getKurmanjiDic``. Defaults to
            the location that was previously hard-coded.

    Returns:
        A tuple ``(translatedText, uniqueNewVocab, numberOfWords)``:
        ``translatedText`` is the list of output tokens (each ending in a
        blank); ``uniqueNewVocab`` lists, in first-seen order and
        lower-cased, the alphabetic words of at least two characters that
        have no dictionary entry; ``numberOfWords`` is the number of
        non-empty input tokens.

    Raises:
        ValueError: If a dictionary entry does not consist of exactly three
            whitespace-separated tokens.
    """
    import codecs
    import re

    import getVocabList

    with codecs.open(inputFile, 'r', encoding='utf-8') as infile:
        inputText = infile.read()

    # Replace punctuation and line breaks with blanks, then squeeze runs of
    # blanks. The character class equals the original one; the pound sign is
    # written as an escape so the literal is valid in Python 2 and 3.
    punctuation = re.compile(u'[$"!\u00a3%&:`)(.,?/\'\r\n]')
    sanitizedText = re.sub(' +', ' ', punctuation.sub(' ', inputText))

    # Leading/trailing separators would otherwise produce empty tokens that
    # inflate the word count and add stray blanks to the output.
    words = [token for token in sanitizedText.split(' ') if token]
    numberOfWords = len(words)

    # Build the lookup table once instead of re-splitting every dictionary
    # entry for every word. ``setdefault`` keeps the first entry for a
    # Kurmanji word, matching a first-match linear scan.
    # The u'{}'.format(...) conversions are retained from the original
    # code, whose author found them necessary for comparisons to work.
    lookup = {}
    for node in getVocabList.getKurmanjiDic(dicFile):
        itemNo, Kurmanji, Sorani = node.split()
        lookup.setdefault(u'{}'.format(Kurmanji), u'{}'.format(Sorani))

    translatedText = []
    uniqueNewVocab = []  # words not found in the dictionary, first-seen order
    seen = set()

    for w in words:
        w = u'{}'.format(w)
        key = w.lower()
        if key in lookup:
            # NOTE(review): per the original author's note, multi-token
            # Sorani equivalents are stored with '-' separators; they are
            # emitted unchanged here -- confirm whether the dashes should
            # be replaced with blanks.
            translatedText.append(lookup[key] + ' ')
            continue

        translatedText.append(w + ' ')
        # Only alphabetic tokens with at least 2 characters are reported
        # as new vocabulary, each one once.
        if w.isalpha() and len(w) > 1 and key not in seen:
            seen.add(key)
            uniqueNewVocab.append(key)

    with codecs.open(outputFile, 'w', encoding='utf-8') as outfile:
        outfile.write(u''.join(translatedText))

    return translatedText, uniqueNewVocab, numberOfWords