Beispiel #1
0
def compare(taggedCorpus, goldenCorpus):
    """
    Calculate the POS tagging accuracy (in percent) of a tagged corpus
    against a gold standard corpus.

    taggedCorpus: path to the automatically tagged corpus file.
    goldenCorpus: path to the gold standard corpus file.
    Returns the tag accuracy as a float percentage, or 0 on error
    (token-count mismatch or word mismatch between the two corpora).
    """
    # Context managers close the file handles deterministically.
    with open(taggedCorpus, "r") as f:
        outputTokens = f.read().split()
    with open(goldenCorpus, "r") as f:
        standardTokens = f.read().split()
    if len(outputTokens) != len(standardTokens):
        print("The numbers of tokens are not equal!")
        return 0
    if not outputTokens:
        return 0.0  # empty corpora: avoid ZeroDivisionError below
    count = 0
    for i in range(len(outputTokens)):
        word1, tag1 = getWordTag(outputTokens[i])
        word2, tag2 = getWordTag(standardTokens[i])
        if word1 != word2:
            print("Data not equal in position", i)
            # Print surrounding context; the slice guards both corpus ends
            # (indexing standardTokens[i + 1] crashed at the last token).
            print(outputTokens[i], *standardTokens[max(i - 1, 0):i + 2])
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100 / float(len(outputTokens))
Beispiel #2
0
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall
    accuracy (all in percent) of taggedCorpus against goldStandardCorpus.

    fullDictFile: lexicon file; words present in it count as "known".
    Returns the scalar 0 when the corpora cannot be compared.
    """
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()
    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)

    numwords = count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are normalized quote marks and may legitimately differ.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print(
                "Words are not the same in gold standard and tagged corpora, at the index "
                + str(i))
            return 0

        correctTag = tag1.lower() == tag2.lower()
        if correctTag:
            count += 1

        if word1 in fullDICT:
            countKN += 1
            if correctTag:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if correctTag:
                countCorrectUNKN += 1

    if numwords == 0:
        return 0.0, 0.0, 0.0  # empty corpora: nothing to score
    # Guard both denominators: a corpus may consist solely of known or
    # solely of unknown words (the old code crashed when countKN == 0).
    knownAcc = countCorrectKN * 100.0 / countKN if countKN > 0 else 0.0
    unknownAcc = countCorrectUNKN * 100.0 / countUNKN if countUNKN > 0 else 0.0
    return knownAcc, unknownAcc, count * 100.0 / numwords
Beispiel #3
0
def computeAccuracy(goldStandardCorpus, taggedCorpus):
    """
    Return the overall tagging accuracy (in percent) of taggedCorpus
    against goldStandardCorpus, or 0 when the corpora cannot be compared.
    """
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()
    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0
    numwords = 0
    count = 0
    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are normalized quote marks and may legitimately differ.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0

        if tag1.lower() == tag2.lower():
            count += 1

    if numwords == 0:
        return 0.0  # empty corpora: avoid ZeroDivisionError
    return count * 100.0 / numwords
Beispiel #4
0
def computeAccuracy(goldStandardCorpus, taggedCorpus):
    """
    Return the overall tagging accuracy (in percent) of taggedCorpus
    against goldStandardCorpus, or 0 when the corpora cannot be compared.
    """
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()
    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0
    numwords = 0
    count = 0
    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are normalized quote marks and may legitimately differ.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0

        if tag1.lower() == tag2.lower():
            count += 1

    if numwords == 0:
        return 0.0  # empty corpora: avoid ZeroDivisionError
    return count * 100.0 / numwords
Beispiel #5
0
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall
    accuracy (all in percent) of taggedCorpus against goldStandardCorpus.

    fullDictFile: lexicon file; words present in it count as "known".
    Returns the scalar 0 when the corpora cannot be compared.
    """
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()
    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)

    numwords = count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are normalized quote marks and may legitimately differ.
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0

        correctTag = tag1.lower() == tag2.lower()
        if correctTag:
            count += 1

        if word1 in fullDICT:
            countKN += 1
            if correctTag:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if correctTag:
                countCorrectUNKN += 1

    if numwords == 0:
        return 0.0, 0.0, 0.0  # empty corpora: nothing to score
    # Guard both denominators: a corpus may consist solely of known or
    # solely of unknown words (the old code crashed when countKN == 0).
    knownAcc = countCorrectKN * 100.0 / countKN if countKN > 0 else 0.0
    unknownAcc = countCorrectUNKN * 100.0 / countUNKN if countUNKN > 0 else 0.0
    return knownAcc, unknownAcc, count * 100.0 / numwords
 def tagRawVnSentence(self, DICT, rawLine):
     """Tag a raw Vietnamese sentence and return it as "word/TAG ..." text."""
     wordTags = initializeVnSentence(DICT, rawLine).split()
     tagged = []
     for index, wordTag in enumerate(wordTags):
         word, initialTag = getWordTag(wordTag)
         firedNode = self.findFiredNode(FWObject.getFWObject(wordTags, index))
         # depth == 0 means the rule tree fired at its root, so the
         # initialized tag is kept unchanged.
         chosenTag = firedNode.conclusion if firedNode.depth > 0 else initialTag
         tagged.append(word + "/" + chosenTag)
     return " ".join(tagged)
 def tagRawEnSentence(self, DICT, rawLine):
     """
     Tag a raw English sentence and return it as "word/TAG ..." text.

     DICT: lexicon used to initialize the sentence with default tags.
     """
     line = initializeEnSentence(DICT, rawLine)
     sen = []
     wordTags = line.split()
     # range() instead of Python-2-only xrange(), consistent with the
     # sibling tagRawVnSentence method.
     for i in range(len(wordTags)):
         fwObject = FWObject.getFWObject(wordTags, i)
         word, tag = getWordTag(wordTags[i])
         node = self.findFiredNode(fwObject)
         if node.depth > 0:
             sen.append(word + "/" + node.conclusion)
         else:  # Fired at root, keep the initialized tag
             sen.append(word + "/" + tag)
     return " ".join(sen)
Beispiel #8
0
def compare(taggedCorpus, goldenCorpus):
    """
    Calculate the POS tagging accuracy (in percent) of a tagged corpus
    against a gold standard corpus.

    taggedCorpus: path to the automatically tagged corpus file.
    goldenCorpus: path to the gold standard corpus file.
    Returns the tag accuracy as a float percentage, or 0 on error
    (token-count mismatch or word mismatch between the two corpora).
    """
    # Context managers close the file handles deterministically.
    with open(taggedCorpus, "r") as f:
        outputTokens = f.read().split()
    with open(goldenCorpus, "r") as f:
        standardTokens = f.read().split()
    if len(outputTokens) != len(standardTokens):
        print("The numbers of tokens are not equal!")
        return 0
    if not outputTokens:
        return 0.0  # empty corpora: avoid ZeroDivisionError below
    count = 0
    for i in range(len(outputTokens)):
        word1, tag1 = getWordTag(outputTokens[i])
        word2, tag2 = getWordTag(standardTokens[i])
        if word1 != word2:
            print("Data not equal in position", i)
            # Print surrounding context; the slice guards both corpus ends
            # (indexing standardTokens[i + 1] crashed at the last token).
            print(outputTokens[i], *standardTokens[max(i - 1, 0):i + 2])
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100 / float(len(outputTokens))
Beispiel #9
0
    def tagRawSentence(self, DICT, rawLine):
        """Tag a raw sentence and return it as "word/TAG ..." text.

        Pass DICT="self" to use the tagger's preloaded dictionary.
        """
        if DICT == "self":
            DICT = self.DICT

        wordTags = initializeSentence(DICT, rawLine).split()
        tagged = []
        for index, wordTag in enumerate(wordTags):
            word, initialTag = getWordTag(wordTag)
            firedNode = self.findFiredNode(FWObject.getFWObject(wordTags, index))
            # depth == 0: the rule tree fired at its root, keep the
            # initialized tag.
            chosenTag = firedNode.conclusion if firedNode.depth > 0 else initialTag
            tagged.append(word + "/" + chosenTag)
        return " ".join(tagged)
def createLexicon(corpusFilePath, fullLexicon):
    """
    Build a tagging lexicon from a gold standard training corpus.

    corpusFilePath: path to the training corpus of "word/TAG" tokens.
    fullLexicon: 'full' writes every word type to a .DICT file;
        'short' drops 1-time-occurrence word types and writes a .sDict file.
    The output file starts with default tags for unknown words, capitalized
    words and numbers, then the word lexicon, then frequent suffix entries.
    """
    if fullLexicon not in ['full', 'short']:
        print("The second parameter gets 'full' or 'short' string-value!")
        print("No lexicon is generated!!!")
        return

    with open(corpusFilePath, "r") as f:
        lines = f.readlines()
    wordTagCounter = {}
    for i in range(len(lines)):
        # Normalize curly/straight double quotes to '' before splitting.
        pairs = lines[i].strip().replace("“", "''").replace("”", "''").replace("\"", "''").split()
        for pair in pairs:
            word, tag = getWordTag(pair)
            # A missing "/" separator leaves word or tag nearly as long as
            # the whole token: treat that as a formatting error.
            if (len(word) >= (len(pair) - 1)) or (len(tag) >= (len(pair) - 1)):
                print("Incorrectly formatted " + str(i + 1) + "th sentence at:", pair)
            else:
                add2WordTagFreqDict(word, tag, wordTagCounter)

    from operator import itemgetter
    dictionary = {}
    suffixDictCounter = {}

    tagCounter_Alphabet = {}
    tagCounter_CapitalizedWord = {}
    tagCounter_Numeric = {}

    for word in wordTagCounter:
        # Most frequent tag observed for this word type.
        pairs = sorted(wordTagCounter[word].items(), key=itemgetter(1), reverse=True)
        tag = pairs[0][0]

        # Python 3 str is already Unicode; no .decode("utf-8") needed.
        isCapital = word[0].isupper()

        if fullLexicon == 'full':
            dictionary[word] = tag
        else:  # Get the lexicon without 1-time-occurrence word types
            if (len(pairs) == 1 and pairs[0][1] > 1) or len(pairs) > 1:
                dictionary[word] = tag

        if re.search(r"[0-9]+", word) is not None:
            tagCounter_Numeric[tag] = tagCounter_Numeric.get(tag, 0) + 1
        else:
            if isCapital:
                tagCounter_CapitalizedWord[tag] = tagCounter_CapitalizedWord.get(tag, 0) + 1
            else:
                tagCounter_Alphabet[tag] = tagCounter_Alphabet.get(tag, 0) + 1

            # Collect 2- to 5-character suffixes of every non-numeric word.
            if len(word) >= 4:
                add2WordTagFreqDict(".*" + word[-3:], tag, suffixDictCounter)
                add2WordTagFreqDict(".*" + word[-2:], tag, suffixDictCounter)
            if len(word) >= 5:
                add2WordTagFreqDict(".*" + word[-4:], tag, suffixDictCounter)
            if len(word) >= 6:
                add2WordTagFreqDict(".*" + word[-5:], tag, suffixDictCounter)

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the lexicon to label unknown words and numbers
    tag4UnknWord = max(tagCounter_Alphabet.items(), key=itemgetter(1))[0]
    tag4UnknCapitalizedWord = tag4UnknWord
    tag4UnknNum = tag4UnknWord
    if len(tagCounter_CapitalizedWord) > 0:
        tag4UnknCapitalizedWord = max(tagCounter_CapitalizedWord.items(), key=itemgetter(1))[0]
    if len(tagCounter_Numeric) > 0:
        tag4UnknNum = max(tagCounter_Numeric.items(), key=itemgetter(1))[0]

    # Write to file
    fileSuffix = ".sDict"
    if fullLexicon == 'full':
        fileSuffix = ".DICT"
    with open(corpusFilePath + fileSuffix, "w") as fileOut:
        fileOut.write("TAG4UNKN-WORD " + tag4UnknWord + "\n")
        fileOut.write("TAG4UNKN-CAPITAL " + tag4UnknCapitalizedWord + "\n")
        fileOut.write("TAG4UNKN-NUM " + tag4UnknNum + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")

        # Emit a suffix entry only when it is frequent enough; longer
        # suffixes need fewer observations (len includes the ".*" prefix).
        minFreq4SuffixLen = {7: 2, 6: 3, 5: 4, 4: 5}
        for suffix in suffixDictCounter:
            sfxTag, freq = max(suffixDictCounter[suffix].items(), key=itemgetter(1))
            if freq >= minFreq4SuffixLen.get(len(suffix), float("inf")):
                fileOut.write(suffix + " " + sfxTag + "\n")
def createLexicon(corpusFile, outDictName, fullLexicon):
    """
    Generate a dictionary from a golden corpus 'corpusFile':
    Output is a file consisting of lines in which each of them contains a
    word and the most frequent associated tag.
    corpusFile: path to the golden training corpus
    outDictName: file name of the dictionary/lexicon
    fullLexicon: gets 'True' or 'False' string-value. If it is 'False', the
        output lexicon does not contain 1-time-occurrence words.
    Returns the generated {word: tag} mapping, or None on invalid input.
    """
    if fullLexicon not in ['True', 'False']:
        print("the third parameter gets \"True\" or \"False\" string-value!!!")
        return

    with open(corpusFile, "r") as f:
        lines = f.readlines()
    tagCounter = {}
    dic = {}
    for line in lines:
        for pair in line.strip().split():
            word, tag = getWordTag(pair)
            # Per-word tag frequencies.
            wordTags = dic.setdefault(word, {})
            wordTags[tag] = wordTags.get(tag, 0) + 1
            # Corpus-wide tag frequencies.
            tagCounter[tag] = tagCounter.get(tag, 0) + 1

    # Get the most frequent tag associated to each word
    from operator import itemgetter
    dictionary = {}
    for word in dic:
        tagFreqDic = dic[word]
        if len(tagFreqDic) == 1:
            onlyTag, freq = next(iter(tagFreqDic.items()))
            if fullLexicon == 'True':  # keep 1-time-occurrence words
                dictionary[word] = onlyTag
            elif freq > 1:  # drop 1-time-occurrence words
                dictionary[word] = onlyTag
        else:
            dictionary[word] = max(tagFreqDic.items(), key=itemgetter(1))[0]

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the training corpus
    mostFreqTag = max(tagCounter.items(), key=itemgetter(1))[0]

    # Write to file
    with open(outDictName, "w") as fileOut:
        fileOut.write("DefaultTag " + mostFreqTag + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")

    return dictionary
 def get_word_tag(self, word):
     """Split a "word/TAG" token using the module-level getWordTag helper."""
     splitPair = getWordTag(word)
     return splitPair
Beispiel #13
0
def createLexicon(corpusFilePath, fullLexicon):
    """
    Build a tagging lexicon from a gold standard training corpus.

    corpusFilePath: path to the training corpus of "word/TAG" tokens.
    fullLexicon: 'full' writes every word type to a .DICT file;
        'short' drops 1-time-occurrence word types and writes a .sDict file.
    The output file starts with default tags for unknown words, capitalized
    words and numbers, then the word lexicon, then frequent suffix entries.
    """
    if fullLexicon not in ['full', 'short']:
        print("The second parameter gets 'full' or 'short' string-value!")
        print("No lexicon is generated!!!")
        return

    with open(corpusFilePath, "r") as f:
        lines = f.readlines()
    wordTagCounter = {}
    for i in range(len(lines)):
        # Normalize curly/straight double quotes to '' before splitting.
        pairs = lines[i].strip().replace("“", "''").replace("”", "''").replace("\"", "''").split()
        for pair in pairs:
            word, tag = getWordTag(pair)
            # A missing "/" separator leaves word or tag nearly as long as
            # the whole token: treat that as a formatting error.
            if (len(word) >= (len(pair) - 1)) or (len(tag) >= (len(pair) - 1)):
                print("Incorrectly formatted " + str(i + 1) + "th sentence at:", pair)
            else:
                add2WordTagFreqDict(word, tag, wordTagCounter)

    from operator import itemgetter
    dictionary = {}
    suffixDictCounter = {}

    tagCounter_Alphabet = {}
    tagCounter_CapitalizedWord = {}
    tagCounter_Numeric = {}

    for word in wordTagCounter:
        # Most frequent tag observed for this word type.
        pairs = sorted(wordTagCounter[word].items(), key=itemgetter(1), reverse=True)
        tag = pairs[0][0]

        # Python 3 str is already Unicode; no .decode("utf-8") needed.
        isCapital = word[0].isupper()

        if fullLexicon == 'full':
            dictionary[word] = tag
        else:  # Get the lexicon without 1-time-occurrence word types
            if (len(pairs) == 1 and pairs[0][1] > 1) or len(pairs) > 1:
                dictionary[word] = tag

        if re.search(r"[0-9]+", word) is not None:
            tagCounter_Numeric[tag] = tagCounter_Numeric.get(tag, 0) + 1
        else:
            if isCapital:
                tagCounter_CapitalizedWord[tag] = tagCounter_CapitalizedWord.get(tag, 0) + 1
            else:
                tagCounter_Alphabet[tag] = tagCounter_Alphabet.get(tag, 0) + 1

            # Collect 2- to 5-character suffixes of every non-numeric word.
            if len(word) >= 4:
                add2WordTagFreqDict(".*" + word[-3:], tag, suffixDictCounter)
                add2WordTagFreqDict(".*" + word[-2:], tag, suffixDictCounter)
            if len(word) >= 5:
                add2WordTagFreqDict(".*" + word[-4:], tag, suffixDictCounter)
            if len(word) >= 6:
                add2WordTagFreqDict(".*" + word[-5:], tag, suffixDictCounter)

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the lexicon to label unknown words and numbers
    tag4UnknWord = max(tagCounter_Alphabet.items(), key=itemgetter(1))[0]
    tag4UnknCapitalizedWord = tag4UnknWord
    tag4UnknNum = tag4UnknWord
    if len(tagCounter_CapitalizedWord) > 0:
        tag4UnknCapitalizedWord = max(tagCounter_CapitalizedWord.items(), key=itemgetter(1))[0]
    if len(tagCounter_Numeric) > 0:
        tag4UnknNum = max(tagCounter_Numeric.items(), key=itemgetter(1))[0]

    # Write to file
    fileSuffix = ".sDict"
    if fullLexicon == 'full':
        fileSuffix = ".DICT"
    with open(corpusFilePath + fileSuffix, "w") as fileOut:
        fileOut.write("TAG4UNKN-WORD " + tag4UnknWord + "\n")
        fileOut.write("TAG4UNKN-CAPITAL " + tag4UnknCapitalizedWord + "\n")
        fileOut.write("TAG4UNKN-NUM " + tag4UnknNum + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")

        # Emit a suffix entry only when it is frequent enough; longer
        # suffixes need fewer observations (len includes the ".*" prefix).
        minFreq4SuffixLen = {7: 2, 6: 3, 5: 4, 4: 5}
        for suffix in suffixDictCounter:
            sfxTag, freq = max(suffixDictCounter[suffix].items(), key=itemgetter(1))
            if freq >= minFreq4SuffixLen.get(len(suffix), float("inf")):
                fileOut.write(suffix + " " + sfxTag + "\n")
def createLexicon(corpusFile, outDictName, fullLexicon):
    """
    Generate a dictionary from a golden corpus 'corpusFile':
    Output is a file consisting of lines in which each of them contains a
    word and the most frequent associated tag.
    corpusFile: path to the golden training corpus
    outDictName: file name of the dictionary/lexicon
    fullLexicon: gets 'True' or 'False' string-value. If it is 'False', the
        output lexicon does not contain 1-time-occurrence words.
    Returns the generated {word: tag} mapping, or None on invalid input.
    """
    if fullLexicon not in ['True', 'False']:
        print("the third parameter gets \"True\" or \"False\" string-value!!!")
        return

    with open(corpusFile, "r") as f:
        lines = f.readlines()
    tagCounter = {}
    dic = {}
    for line in lines:
        for pair in line.strip().split():
            word, tag = getWordTag(pair)
            # Per-word tag frequencies.
            wordTags = dic.setdefault(word, {})
            wordTags[tag] = wordTags.get(tag, 0) + 1
            # Corpus-wide tag frequencies.
            tagCounter[tag] = tagCounter.get(tag, 0) + 1

    # Get the most frequent tag associated to each word
    from operator import itemgetter
    dictionary = {}
    for word in dic:
        tagFreqDic = dic[word]
        if len(tagFreqDic) == 1:
            onlyTag, freq = next(iter(tagFreqDic.items()))
            if fullLexicon == 'True':  # keep 1-time-occurrence words
                dictionary[word] = onlyTag
            elif freq > 1:  # drop 1-time-occurrence words
                dictionary[word] = onlyTag
        else:
            dictionary[word] = max(tagFreqDic.items(), key=itemgetter(1))[0]

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the training corpus
    mostFreqTag = max(tagCounter.items(), key=itemgetter(1))[0]

    # Write to file
    with open(outDictName, "w") as fileOut:
        fileOut.write("DefaultTag " + mostFreqTag + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")

    return dictionary