def compare(taggedCorpus, goldenCorpus):
    """
    Compute the POS tagging accuracy of a tagged corpus against a golden corpus.

    Both files are read as whitespace-separated tokens and compared position
    by position. Each token is split into a word and a tag with getWordTag;
    tags are compared case-insensitively.

    taggedCorpus: path to the corpus produced by the tagger
    goldenCorpus: path to the golden (reference) corpus

    Returns the percentage (0-100, float) of tokens whose tags match.
    Returns 0 when the two files have different numbers of tokens, when the
    words at some position differ (a diagnostic is printed in both cases),
    or when both files contain no tokens at all.
    """
    # Context managers make sure both files are closed after reading.
    with open(taggedCorpus, "r") as taggedFile:
        outputTokens = taggedFile.read().split()
    with open(goldenCorpus, "r") as goldenFile:
        standardTokens = goldenFile.read().split()

    if len(outputTokens) != len(standardTokens):
        print("The numbers of tokens are not equal!")
        return 0

    if not outputTokens:
        # Nothing to score: avoid dividing by zero below.
        return 0

    count = 0
    for i, (outputToken, standardToken) in enumerate(zip(outputTokens, standardTokens)):
        word1, tag1 = getWordTag(outputToken)
        word2, tag2 = getWordTag(standardToken)
        if word1 != word2:
            print("Data not equal in position", i)
            # Show the golden token with whichever neighbours exist. The slice
            # is clamped so the first position does not wrap around to the end
            # of the list and the last position does not index past it.
            context = standardTokens[max(i - 1, 0):i + 2]
            print(" ".join([outputToken] + context))
            return 0
        if tag1.lower() == tag2.lower():
            count += 1

    return count * 100 / float(len(outputTokens))
def tagRawSentence(self, DICT, rawLine):
    """
    Tag one raw sentence and return it as space-separated "word/tag" tokens.

    The sentence is first initialized with VnInitTagger4Sentence(DICT, rawLine).
    Curly and straight double quotes in the initialized line are replaced by
    two single quotes before it is split into word/tag tokens. For every
    token, the tag written out is the conclusion of the node returned by
    self.findFiredNode for that position.
    """
    initialized = VnInitTagger4Sentence(DICT, rawLine)
    for quote in ("“", "”", "\""):
        initialized = initialized.replace(quote, "''")
    tokens = initialized.split()

    tagged = []
    for position, token in enumerate(tokens):
        fwObject = FWObject.getFWObject(tokens, position)
        # Only the word part is needed; the initial tag is always replaced.
        word, _ = getWordTag(token)
        firedNode = self.findFiredNode(fwObject)
        tagged.append(word + "/" + firedNode.conclusion)
    return " ".join(tagged).strip()
def tagRawSentence(self, DICT, rawLine):
    """
    Tag one raw sentence and return it as space-separated "word/tag" tokens.

    The sentence is first initialized with InitTagger4Sentence(DICT, rawLine).
    Curly and straight double quotes in the initialized line are replaced by
    two single quotes before it is split into word/tag tokens. For every
    token, the node returned by self.findFiredNode decides the tag: its
    conclusion is used when its depth is greater than 0, otherwise the tag
    from initialization is kept.
    """
    initialized = InitTagger4Sentence(DICT, rawLine)
    for quote in ("“", "”", '"'):
        initialized = initialized.replace(quote, "''")
    tokens = initialized.split()

    tagged = []
    for position, token in enumerate(tokens):
        fwObject = FWObject.getFWObject(tokens, position)
        word, initialTag = getWordTag(token)
        firedNode = self.findFiredNode(fwObject)
        # Fired at root (depth 0): keep the initialized tag.
        chosenTag = firedNode.conclusion if firedNode.depth > 0 else initialTag
        tagged.append(word + "/" + chosenTag)
    return " ".join(tagged).strip()
def createLexicon(corpusFile, outDictName, fullLexicon):
    """
    Generate a dictionary (lexicon) from a golden corpus.

    The output file starts with a "DefaultTag <tag>" line, followed by one
    "<word> <tag>" line per lexicon entry, sorted by word, where <tag> is the
    most frequent tag observed for that word in the corpus.

    corpusFile: path to the golden training corpus
    outDictName: file name of the dictionary/lexicon to write
    fullLexicon: the string 'True' or 'False'. If it is 'False', the output
        lexicon does not contain words that occur only once in the corpus.

    Returns the lexicon as an OrderedDict mapping word -> tag (sorted by
    word). Returns None, after printing a message, when fullLexicon is not
    one of the two accepted strings.

    Raises ValueError when the corpus contains no word/tag tokens, because no
    default tag can be derived in that case.
    """
    from collections import OrderedDict
    from operator import itemgetter

    if fullLexicon not in ['True', 'False']:
        print("the third parameter gets \"True\" or \"False\" string-value!!!")
        return

    # Count, for every word, how often each tag is attached to it.
    wordTagCounter = {}
    with open(corpusFile, "r") as corpus:
        for line in corpus:
            for pair in line.split():
                word, tag = getWordTag(pair)
                tagFreqs = wordTagCounter.setdefault(word, {})
                tagFreqs[tag] = tagFreqs.get(tag, 0) + 1

    # Get the most frequent tag associated to each word.
    keepSingletons = fullLexicon == 'True'
    dictionary = {}
    tagCounter = {}
    for word, tagFreqs in wordTagCounter.items():
        # sorted() works on dict views (Python 3) as well as lists; the sort
        # is stable, so ties keep the first tag in iteration order.
        ranked = sorted(tagFreqs.items(), key=itemgetter(1), reverse=True)
        mostFreqTag, topCount = ranked[0]

        # A word seen with a single tag exactly once is a 1-time occurrence
        # word; it is only kept when the full lexicon is requested.
        if keepSingletons or len(ranked) > 1 or topCount > 1:
            dictionary[word] = mostFreqTag

        # NOTE(review): the tag is counted for every corpus word, including
        # words filtered out of the lexicon above -- this is the only nesting
        # that works for both 'True' and 'False'; confirm against the
        # original layout.
        tagCounter[mostFreqTag] = tagCounter.get(mostFreqTag, 0) + 1

    if not tagCounter:
        raise ValueError(
            "cannot create a lexicon: no word/tag tokens found in " + str(corpusFile))

    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Get the most frequent tag in the lexicon to label unknown words.
    # max() returns the first maximal item, matching a stable descending sort.
    defaultTag = max(tagCounter.items(), key=itemgetter(1))[0]

    # Write to file
    with open(outDictName, "w") as fileOut:
        fileOut.write("DefaultTag " + defaultTag + "\n")
        for key, tag in dictionary.items():
            fileOut.write(key + " " + tag + "\n")

    return dictionary