Ejemplo n.º 1
0
def compare(taggedCorpus, goldenCorpus):
    """
    Calculate the POS tagging accuracy of a tagged corpus against a golden corpus.

    Both files are read as whitespace-separated tokens and compared position by
    position. Every token is split into a (word, tag) pair by getWordTag, which
    is defined outside this function.

    taggedCorpus: path to the file produced by the tagger
    goldenCorpus: path to the gold-standard file

    Returns the percentage (0-100, as a float) of tokens whose tags are equal
    ignoring case. Returns 0 when the two files do not have the same number of
    tokens, when the words at some position differ, or when both files are
    empty (there is nothing to score).
    """
    # Context managers make sure both files are closed, even on error.
    with open(taggedCorpus, "r") as taggedFile:
        outputTokens = taggedFile.read().split()
    with open(goldenCorpus, "r") as goldenFile:
        standardTokens = goldenFile.read().split()

    if len(outputTokens) != len(standardTokens):
        print ("The numbers of tokens are not equal!")
        return 0
    if not outputTokens:
        # Two empty corpora: avoid dividing by zero below.
        return 0.0

    count = 0
    for i, (outputToken, standardToken) in enumerate(zip(outputTokens, standardTokens)):
        word1, tag1 = getWordTag(outputToken)
        word2, tag2 = getWordTag(standardToken)
        if word1 != word2:
            print ("Data not equal in position", i)
            # Show the neighbouring golden tokens. Slicing clamps at both ends,
            # so a mismatch on the first or last token neither wraps around to
            # the other end of the list nor raises IndexError.
            context = standardTokens[max(0, i - 1):i + 2]
            print (" ".join([outputToken] + context))
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100 / float(len(outputTokens))
Ejemplo n.º 2
0
 def tagRawSentence(self, DICT, rawLine):
     """
     Tag one raw sentence and return it as space-separated "word/tag" pairs.

     The sentence is first passed, together with DICT, to
     VnInitTagger4Sentence (presumably an initial dictionary-based tagging --
     confirm against its definition). Curly and straight double quotes in that
     result are replaced by '' before it is split on whitespace. For every
     token, the tag written out is the conclusion of the node returned by
     self.findFiredNode for the token's FWObject.
     """
     initialized = VnInitTagger4Sentence(DICT, rawLine)
     for quote in ("“", "”", "\""):
         initialized = initialized.replace(quote, "''")
     tokens = initialized.split()

     pieces = []
     for position, token in enumerate(tokens):
         features = FWObject.getFWObject(tokens, position)
         # Only the word is needed here; the initial tag is discarded.
         word, _ = getWordTag(token)
         fired = self.findFiredNode(features)
         pieces.append(word + "/" + fired.conclusion + " ")
     return "".join(pieces).strip()
Ejemplo n.º 3
0
 def tagRawSentence(self, DICT, rawLine):
     """
     Tag one raw sentence and return it as space-separated "word/tag" pairs.

     The sentence is first passed, together with DICT, to InitTagger4Sentence
     (presumably an initial dictionary-based tagging -- confirm against its
     definition). Curly and straight double quotes in that result are replaced
     by '' before it is split on whitespace. For every token, the node returned
     by self.findFiredNode decides the tag: its conclusion when node.depth > 0,
     otherwise the tag the token already carried.
     """
     initialized = InitTagger4Sentence(DICT, rawLine)
     for quote in ("“", "”", '"'):
         initialized = initialized.replace(quote, "''")
     tokens = initialized.split()

     pieces = []
     for position, token in enumerate(tokens):
         features = FWObject.getFWObject(tokens, position)
         word, initialTag = getWordTag(token)
         fired = self.findFiredNode(features)
         # A node at depth 0 is the root: keep the initialized tag in that case.
         chosenTag = fired.conclusion if fired.depth > 0 else initialTag
         pieces.append(word + "/" + chosenTag + " ")
     return "".join(pieces).strip()
Ejemplo n.º 4
0
def createLexicon(corpusFile, outDictName, fullLexicon):
    """
    Generate a dictionary (lexicon) from a golden corpus 'corpusFile'.

    The output file starts with a "DefaultTag <tag>" line, followed by one
    "<word> <tag>" line per kept word (sorted by word), where <tag> is the tag
    most frequently associated with that word in the corpus.

    corpusFile: path to the golden training corpus; every whitespace-separated
        token is split into (word, tag) by getWordTag, defined outside this
        function
    outDictName: file name of the dictionary/lexicon to write
    fullLexicon: the string 'True' or 'False'. If it is 'False', the output
        lexicon does not contain words occurring only once in the corpus

    Returns the lexicon as an OrderedDict mapping word -> most frequent tag,
    or None (after printing a message) when fullLexicon is neither 'True' nor
    'False'.

    Raises ValueError when the corpus contains no tokens at all, because no
    default tag can be derived; the output file is not created in that case.
    """
    if fullLexicon not in ['True', 'False']:
        print( "the third parameter gets \"True\" or \"False\" string-value!!!")
        return

    from collections import Counter, OrderedDict, defaultdict
    from operator import itemgetter

    # word -> Counter of the tags seen with that word
    wordTagCounter = defaultdict(Counter)
    with open(corpusFile, "r") as corpus:
        for line in corpus:
            for pair in line.split():
                word, tag = getWordTag(pair)
                wordTagCounter[word][tag] += 1

    keepSingletons = fullLexicon == 'True'

    # Get the most frequent tag associated to each word
    dictionary = {}
    tagCounter = Counter()
    for word, tagFreqs in wordTagCounter.items():
        # max() keeps the first maximal entry, i.e. the same tie-breaking as a
        # stable descending sort followed by taking element 0.
        mostFreqTag = max(tagFreqs.items(), key=itemgetter(1))[0]

        # Without the full lexicon, drop words seen exactly once in the corpus.
        if keepSingletons or sum(tagFreqs.values()) > 1:
            dictionary[word] = mostFreqTag

        # Counted for every word, including those left out of the lexicon.
        tagCounter[mostFreqTag] += 1

    if not tagCounter:
        raise ValueError("cannot build a lexicon: no tokens found in " + repr(corpusFile))

    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the lexicon to label unknown words
    defaultTag = max(tagCounter.items(), key=itemgetter(1))[0]

    # Write to file
    with open(outDictName, "w") as fileOut:
        fileOut.write("DefaultTag " + defaultTag + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")

    return dictionary