def compare(taggedCorpus, goldenCorpus):
    """Return the POS-tagging accuracy (percent) of taggedCorpus against goldenCorpus.

    taggedCorpus: path to the automatically tagged corpus of "word/tag" tokens.
    goldenCorpus: path to the gold-standard corpus of "word/tag" tokens.
    Returns 0 when the token counts differ or a word mismatch is found,
    otherwise the percentage of tokens whose tags match (case-insensitive).
    """
    # Context managers close the files deterministically (the original leaked handles).
    with open(taggedCorpus, "r") as f:
        outputTokens = f.read().split()
    with open(goldenCorpus, "r") as f:
        standardTokens = f.read().split()

    if len(outputTokens) != len(standardTokens):
        print("The numbers of tokens are not equal!")
        return 0
    if not outputTokens:
        # Both corpora are empty: avoid dividing by zero below.
        return 0

    count = 0
    for i in range(len(outputTokens)):
        word1, tag1 = getWordTag(outputTokens[i])
        word2, tag2 = getWordTag(standardTokens[i])
        if word1 != word2:
            # Show the mismatch with its gold-standard context. Slicing avoids the
            # IndexError the original raised when the mismatch was at position 0
            # or at the last token (standardTokens[i-1] / [i+1]).
            print("Data not equal in position", i)
            print(outputTokens[i], " ".join(standardTokens[max(i - 1, 0):i + 2]))
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100 / float(len(outputTokens))
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """Return (known-word accuracy, unknown-word accuracy, overall accuracy) in percent.

    fullDictFile: path to the lexicon file; a word is "known" if it is a key
        of the dictionary returned by readDictionary(fullDictFile).
    goldStandardCorpus / taggedCorpus: paths to corpora of "word/tag" tokens.
    Returns 0 when the corpora differ in token count or word content.
    """
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!"
              % (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)
    numwords = count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0
    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are tolerated on either side (quote normalization artifacts).
        if word1 != word2 and word1 != "''" and word2 != "''":
            print(
                "Words are not the same in gold standard and tagged corpora, at the index "
                + str(i))
            return 0
        correct = tag1.lower() == tag2.lower()
        if correct:
            count += 1
        if word1 in fullDICT:
            countKN += 1
            if correct:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if correct:
                countCorrectUNKN += 1
    # Guard every division: the original crashed with ZeroDivisionError when
    # countKN == 0 (it only handled the countUNKN == 0 case).
    knownAcc = countCorrectKN * 100.0 / countKN if countKN > 0 else 0.0
    unknownAcc = countCorrectUNKN * 100.0 / countUNKN if countUNKN > 0 else 0.0
    overallAcc = count * 100.0 / numwords if numwords > 0 else 0.0
    return knownAcc, unknownAcc, overallAcc
def computeAccuracy(goldStandardCorpus, taggedCorpus):
    """Return the overall tagging accuracy (percent) of taggedCorpus against the gold standard.

    goldStandardCorpus / taggedCorpus: paths to corpora of "word/tag" tokens.
    Returns 0 when the corpora differ in token count or word content.
    """
    # Use context managers so the file handles are closed deterministically.
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!"
              % (goldStandardCorpus, taggedCorpus))
        return 0
    if not tagged:
        # Both corpora empty: avoid ZeroDivisionError.
        return 0

    count = 0
    for i in range(len(tagged)):
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are tolerated on either side (quote normalization artifacts).
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100.0 / len(tagged)
def computeAccuracy(goldStandardCorpus, taggedCorpus):
    """Return the overall tagging accuracy (percent) of taggedCorpus against the gold standard.

    goldStandardCorpus / taggedCorpus: paths to corpora of "word/tag" tokens.
    Returns 0 when the corpora differ in token count or word content.
    """
    # Context managers replace the original's leaked file handles; print() replaces
    # the Python-2-only print statement.
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" % (
            goldStandardCorpus, taggedCorpus))
        return 0
    if not tagged:
        # Both corpora empty: avoid ZeroDivisionError.
        return 0

    count = 0
    for i in range(len(tagged)):
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are tolerated on either side (quote normalization artifacts).
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100.0 / len(tagged)
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """Return (known-word accuracy, unknown-word accuracy, overall accuracy) in percent.

    fullDictFile: path to the lexicon file; a word is "known" if it is a key
        of the dictionary returned by readDictionary(fullDictFile).
    goldStandardCorpus / taggedCorpus: paths to corpora of "word/tag" tokens.
    Returns 0 when the corpora differ in token count or word content.
    """
    # print() / range replace the Python-2-only print statement / xrange;
    # context managers close the files the original leaked.
    with open(taggedCorpus, "r") as f:
        tagged = f.read().split()
    with open(goldStandardCorpus, "r") as f:
        goldStandard = f.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!"
              % (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)
    numwords = count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0
    for i in range(len(tagged)):
        numwords += 1
        word1, tag1 = getWordTag(tagged[i])
        word2, tag2 = getWordTag(goldStandard[i])
        # "''" tokens are tolerated on either side (quote normalization artifacts).
        if word1 != word2 and word1 != "''" and word2 != "''":
            print("Words are not the same in gold standard and tagged corpora, at the index", i)
            return 0
        correct = tag1.lower() == tag2.lower()
        if correct:
            count += 1
        if word1 in fullDICT:
            countKN += 1
            if correct:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if correct:
                countCorrectUNKN += 1
    # Guard every division: the original crashed with ZeroDivisionError when
    # countKN == 0 (it only handled the countUNKN == 0 case).
    knownAcc = countCorrectKN * 100.0 / countKN if countKN > 0 else 0.0
    unknownAcc = countCorrectUNKN * 100.0 / countUNKN if countUNKN > 0 else 0.0
    overallAcc = count * 100.0 / numwords if numwords > 0 else 0.0
    return knownAcc, unknownAcc, overallAcc
def tagRawVnSentence(self, DICT, rawLine):
    """Tag a raw Vietnamese sentence and return it as space-joined "word/tag" tokens.

    DICT: lexicon used to initialize the per-word tags.
    rawLine: the raw sentence text.
    """
    wordTags = initializeVnSentence(DICT, rawLine).split()
    tagged = []
    for position, pair in enumerate(wordTags):
        context = FWObject.getFWObject(wordTags, position)
        word, initialTag = getWordTag(pair)
        firedNode = self.findFiredNode(context)
        # depth > 0 means a real rule fired; at the root we keep the
        # dictionary-initialized tag.
        finalTag = firedNode.conclusion if firedNode.depth > 0 else initialTag
        tagged.append(word + "/" + finalTag)
    return " ".join(tagged)
def tagRawEnSentence(self, DICT, rawLine):
    """Tag a raw English sentence and return it as space-joined "word/tag" tokens.

    DICT: lexicon used to initialize the per-word tags.
    rawLine: the raw sentence text.
    """
    line = initializeEnSentence(DICT, rawLine)
    sen = []
    wordTags = line.split()
    # range replaces the Python-2-only xrange, consistent with the sibling taggers.
    for i in range(len(wordTags)):
        fwObject = FWObject.getFWObject(wordTags, i)
        word, tag = getWordTag(wordTags[i])
        node = self.findFiredNode(fwObject)
        if node.depth > 0:
            sen.append(word + "/" + node.conclusion)
        else:
            # Fired at root: keep the dictionary-initialized tag.
            sen.append(word + "/" + tag)
    return " ".join(sen)
def compare(taggedCorpus, goldenCorpus):
    """Return the POS-tagging accuracy (percent) of taggedCorpus against goldenCorpus.

    taggedCorpus: path to the automatically tagged corpus of "word/tag" tokens.
    goldenCorpus: path to the gold-standard corpus of "word/tag" tokens.
    Returns 0 when the token counts differ or a word mismatch is found,
    otherwise the percentage of tokens whose tags match (case-insensitive).
    """
    # print() / range replace the Python-2-only print statement / xrange;
    # context managers close the files the original leaked.
    with open(taggedCorpus, "r") as f:
        outputTokens = f.read().split()
    with open(goldenCorpus, "r") as f:
        standardTokens = f.read().split()

    if len(outputTokens) != len(standardTokens):
        print("The numbers of tokens are not equal!")
        return 0
    if not outputTokens:
        # Both corpora empty: avoid ZeroDivisionError.
        return 0

    count = 0
    for i in range(len(outputTokens)):
        word1, tag1 = getWordTag(outputTokens[i])
        word2, tag2 = getWordTag(standardTokens[i])
        if word1 != word2:
            # Show the mismatch with its gold-standard context. Slicing avoids the
            # IndexError the original raised at position 0 or the last token.
            print("Data not equal in position", i)
            print(outputTokens[i], " ".join(standardTokens[max(i - 1, 0):i + 2]))
            return 0
        if tag1.lower() == tag2.lower():
            count += 1
    return count * 100 / float(len(outputTokens))
def tagRawSentence(self, DICT, rawLine):
    """Tag a raw sentence and return it as space-joined "word/tag" tokens.

    DICT: lexicon used to initialize the per-word tags; pass the string "self"
        to use the tagger's preloaded dictionary (self.DICT).
    rawLine: the raw sentence text.
    """
    if DICT == "self":
        DICT = self.DICT
    wordTags = initializeSentence(DICT, rawLine).split()
    tagged = []
    for position, pair in enumerate(wordTags):
        context = FWObject.getFWObject(wordTags, position)
        word, initialTag = getWordTag(pair)
        firedNode = self.findFiredNode(context)
        # depth > 0 means a real rule fired; at the root we keep the
        # dictionary-initialized tag.
        finalTag = firedNode.conclusion if firedNode.depth > 0 else initialTag
        tagged.append(word + "/" + finalTag)
    return " ".join(tagged)
def createLexicon(corpusFilePath, fullLexicon):
    """Build a lexicon file from a gold-standard training corpus.

    corpusFilePath: path to a training corpus of "word/tag" tokens.
    fullLexicon: 'full' keeps every word type and writes corpusFilePath + ".DICT";
        'short' drops word types occurring once and writes corpusFilePath + ".sDict".
    The output file begins with TAG4UNKN-WORD / TAG4UNKN-CAPITAL / TAG4UNKN-NUM
    fallback tags, then word->tag entries, then ".*"-prefixed suffix->tag entries
    used to guess tags for unknown words.
    """
    if fullLexicon not in ['full', 'short']:
        print("The second parameter gets 'full' or 'short' string-value!")
        print("No lexicon is generated!!!")
        return

    with open(corpusFilePath, "r") as f:
        lines = f.readlines()

    wordTagCounter = {}
    for i in range(len(lines)):
        # Normalize curly and straight double quotes to '' before splitting.
        pairs = lines[i].strip().replace("“", "''").replace("”", "''").replace("\"", "''").split()
        for pair in pairs:
            word, tag = getWordTag(pair)
            # A well-formed token must contain a "/" separator, so both the word
            # and the tag are strictly shorter than the token minus one character.
            if (len(word) >= (len(pair) - 1)) or (len(tag) >= (len(pair) - 1)):
                print("Incorrectly formatted " + str(i + 1) + "th sentence at:", pair)
            else:
                add2WordTagFreqDict(word, tag, wordTagCounter)

    from operator import itemgetter
    dictionary = {}
    suffixDictCounter = {}
    tagCounter_Alphabet = {}
    tagCounter_CapitalizedWord = {}
    tagCounter_Numeric = {}
    for word in wordTagCounter:
        # Most frequent tag for this word (stable sort keeps ties deterministic).
        pairs = sorted(wordTagCounter[word].items(), key=itemgetter(1), reverse=True)
        tag = pairs[0][0]
        # Python 3 strings are already Unicode; the Py2 .decode("utf-8") is gone.
        isCapital = word[0].isupper()
        if fullLexicon == 'full':
            dictionary[word] = tag
        else:
            # Short lexicon: exclude word types that occur exactly once.
            if (len(pairs) == 1 and pairs[0][1] > 1) or len(pairs) > 1:
                dictionary[word] = tag
        # Tally tag frequencies per word class for the unknown-word fallback tags.
        if re.search(r"[0-9]+", word) is not None:
            tagCounter_Numeric[tag] = tagCounter_Numeric.get(tag, 0) + 1
        elif isCapital:
            tagCounter_CapitalizedWord[tag] = tagCounter_CapitalizedWord.get(tag, 0) + 1
        else:
            tagCounter_Alphabet[tag] = tagCounter_Alphabet.get(tag, 0) + 1
        # Collect ".*"-prefixed suffixes (2-5 trailing characters) for unknown-word
        # guessing. NOTE(review): lengths below are Unicode characters, whereas the
        # Python-2 original measured UTF-8 bytes; identical for ASCII corpora —
        # confirm this is acceptable for non-ASCII (e.g. Vietnamese) corpora.
        if len(word) >= 4:
            add2WordTagFreqDict(".*" + word[-3:], tag, suffixDictCounter)
            add2WordTagFreqDict(".*" + word[-2:], tag, suffixDictCounter)
        if len(word) >= 5:
            add2WordTagFreqDict(".*" + word[-4:], tag, suffixDictCounter)
        if len(word) >= 6:
            add2WordTagFreqDict(".*" + word[-5:], tag, suffixDictCounter)

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # The most frequent tag in each word class labels unknown words at runtime.
    tagCounter_Alphabet = OrderedDict(
        sorted(tagCounter_Alphabet.items(), key=itemgetter(1), reverse=True))
    tagCounter_CapitalizedWord = OrderedDict(
        sorted(tagCounter_CapitalizedWord.items(), key=itemgetter(1), reverse=True))
    tagCounter_Numeric = OrderedDict(
        sorted(tagCounter_Numeric.items(), key=itemgetter(1), reverse=True))
    tag4UnknWord = next(iter(tagCounter_Alphabet))
    tag4UnknCapitalizedWord = tag4UnknWord
    tag4UnknNum = tag4UnknWord
    if len(tagCounter_CapitalizedWord) > 0:
        tag4UnknCapitalizedWord = next(iter(tagCounter_CapitalizedWord))
    if len(tagCounter_Numeric) > 0:
        tag4UnknNum = next(iter(tagCounter_Numeric))

    # Write the lexicon to file.
    fileSuffix = ".DICT" if fullLexicon == 'full' else ".sDict"
    with open(corpusFilePath + fileSuffix, "w") as fileOut:
        fileOut.write("TAG4UNKN-WORD " + tag4UnknWord + "\n")
        fileOut.write("TAG4UNKN-CAPITAL " + tag4UnknCapitalizedWord + "\n")
        fileOut.write("TAG4UNKN-NUM " + tag4UnknNum + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")
        for suffix in suffixDictCounter:
            pairs = sorted(suffixDictCounter[suffix].items(), key=itemgetter(1), reverse=True)
            tag, freq = pairs[0]
            # Lengths include the ".*" prefix (7 -> 5-char suffix ... 4 -> 2-char);
            # shorter suffixes require more occurrences to be kept.
            if ((len(suffix) == 7 and freq >= 2) or (len(suffix) == 6 and freq >= 3)
                    or (len(suffix) == 5 and freq >= 4) or (len(suffix) == 4 and freq >= 5)):
                fileOut.write(suffix + " " + tag + "\n")
def createLexicon(corpusFile, outDictName, fullLexicon):
    """Generate a lexicon mapping each word to its most frequent tag.

    corpusFile: path to the gold-standard training corpus of "word/tag" tokens.
    outDictName: output file name; its first line is "DefaultTag <mostFrequentTag>".
    fullLexicon: the string 'True' keeps 1-occurrence words; 'False' drops them.
    Returns the ordered word->tag dictionary, or None on an invalid parameter.
    """
    if fullLexicon not in ['True', 'False']:
        print("the third parameter gets \"True\" or \"False\" string-value!!!")
        return

    with open(corpusFile, "r") as f:
        lines = f.readlines()

    tagCounter = {}
    dic = {}
    for line in lines:
        for pair in line.strip().split():
            word, tag = getWordTag(pair)
            # Per-word tag frequency table.
            tagFreqs = dic.setdefault(word, {})
            tagFreqs[tag] = tagFreqs.get(tag, 0) + 1
            # Corpus-wide tag frequency, used for the default tag.
            tagCounter[tag] = tagCounter.get(tag, 0) + 1

    # Keep the most frequent tag associated with each word.
    from operator import itemgetter
    dictionary = {}
    for word, tagFreqDic in dic.items():
        if len(tagFreqDic) == 1:
            (tag, freq), = tagFreqDic.items()
            # 'False' excludes word types that occur exactly once.
            if fullLexicon == 'True' or freq > 1:
                dictionary[word] = tag
        else:
            # max() returns the first maximal entry, matching the original's
            # stable sort + take-first behavior.
            dictionary[word] = max(tagFreqDic.items(), key=itemgetter(1))[0]

    from collections import OrderedDict
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))

    # Most frequent tag in the whole training corpus.
    mostFreqTag = max(tagCounter.items(), key=itemgetter(1))[0]

    # Write to file.
    with open(outDictName, "w") as fileOut:
        fileOut.write("DefaultTag " + mostFreqTag + "\n")
        for key in dictionary:
            fileOut.write(key + " " + dictionary[key] + "\n")
    return dictionary
def get_word_tag(self, word):
    """Split a "word/tag" token; thin wrapper delegating to the module-level getWordTag."""
    return getWordTag(word)