def tagBigboss(unsegSentences):
    """Split code-switched sentences into monolingual chunks, POS-tag each
    chunk with a language-specific tagger, reassemble the sentences, and
    write the tagged output to a CSV file.

    unsegSentences -- code-switched input sentences (format defined by the
    SentenceSplitter / Reader conventions used elsewhere in this file).
    """
    # Train the English tagger on WSJ data.
    enLearner = learner.Learner()
    enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)
    # Train the Hindi tagger on Hindi data.
    hiLearner = learner.Learner()
    hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)
    # Score and split the mixed-language input into monolingual chunks.
    SS = SentenceSplitter()
    SS.trainLMsDefault()
    SS.loadSentences(unsegSentences)
    SS.scoreSentences()
    SS.splitSentences()
    testSents = SS.splittedSentences
    # Each split entry is indexed as [0]=chunk tokens, [1]=language tag
    # ('E'/'H'), [2]=sentence id, [3]=chunk position — presumably; confirm
    # against SentenceSplitter.splitSentences.
    enTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'E']
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences(map(lambda x:x[0], enTestSents))
    enTagger.tag()
    hiTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'H']
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences(map(lambda x:x[0],hiTestSents))
    hiTagger.tag()
    # Report per-language and combined (micro-averaged) accuracy.
    print "English Accuracy:", enTagger.accuracy()
    print "Hindi Accuracy:", hiTagger.accuracy()
    enCorrect, enTotal = enTagger.getAccuCounts()
    hiCorrect, hiTotal = hiTagger.getAccuCounts()
    print "EN Total:", enTotal, "EN Correct:", enCorrect
    print "HI Total:", hiTotal, "HI Correct:", hiCorrect
    print "Total Accuracy:", (enCorrect+hiCorrect)*100.0/(enTotal+hiTotal)
    # Group tagged chunks by sentence id: sentId -> [(taggedChunk, chunkPos)].
    taggedSentneces = dd(list)
    assert len(enTestSents) == len(enTagger.outputs)
    for sentIndex in range(len(enTagger.outputs)):
        output = enTagger.outputs[sentIndex]
        taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
    assert len(hiTestSents) == len(hiTagger.outputs)
    for sentIndex in range(len(hiTagger.outputs)):
        output = hiTagger.outputs[sentIndex]
        taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
    # Reassemble each sentence by concatenating its chunks in chunk order.
    for sentId in taggedSentneces:
        sent = []
        taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
        for chunk in taggedChunks:
            sent.extend(chunk[0])
        taggedSentneces[sentId] = sent
    # Write one row per token, with a blank ["", ""] row between sentences.
    taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
    writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
    for sent in taggedSentneces.itervalues():
        for line in sent:
            writer.writerow(line)
        writer.writerow(["", ""])
def tagPure(unsegSentences):
    """Normalize and split code-switched sentences, tag each monolingual
    chunk with its language-specific tagger (top-5 tags per token), and
    return the reassembled sentences.

    Returns a dict mapping sentence id -> flat list of per-token outputs
    (each item appears to be (token, top-K tags) — confirm against
    Tagger.tagTopK), built by concatenating the sentence's chunks in order.
    """
    unsegSentences = normalizeManually(unsegSentences)
    # purelySplitSentences entries: [0]=chunk tokens, [1]=language tag
    # ('E'/'H'), [2]=sentence id, [3]=chunk position — presumably; verify.
    testSents = purelySplitSentences(unsegSentences)
    # Train the English tagger on WSJ data.
    enLearner = learner.Learner()
    enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)
    # Train the Hindi tagger on Hindi data.
    hiLearner = learner.Learner()
    hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)
    enTestSents = [(testSents[index][0], testSents[index][2], testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'E']
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences(map(lambda x:x[0], enTestSents))
    # Keep the 5 best tags per token instead of a single best tag, so a
    # downstream decoder can re-rank them.
    # enTagger.tag()
    enTagger.tagTopK(5)
    hiTestSents = [(testSents[index][0], testSents[index][2], testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'H']
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences(map(lambda x:x[0], hiTestSents))
    # hiTagger.tag()
    hiTagger.tagTopK(5)
    # Report per-language and combined (micro-averaged) accuracy.
    print "English Accuracy:", enTagger.accuracy()
    print "Hindi Accuracy:", hiTagger.accuracy()
    enCorrect, enTotal = enTagger.getAccuCounts()
    hiCorrect, hiTotal = hiTagger.getAccuCounts()
    print "EN Total:", enTotal, "EN Correct:", enCorrect
    print "HI Total:", hiTotal, "HI Correct:", hiCorrect
    print "Total Accuracy:", (enCorrect + hiCorrect) * 100.0 / (enTotal + hiTotal)
    # Group tagged chunks by sentence id: sentId -> [(taggedChunk, chunkPos)].
    taggedSentneces = dd(list)
    assert len(enTestSents) == len(enTagger.outputs)
    for sentIndex in range(len(enTagger.outputs)):
        output = enTagger.outputs[sentIndex]
        taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
    assert len(hiTestSents) == len(hiTagger.outputs)
    for sentIndex in range(len(hiTagger.outputs)):
        output = hiTagger.outputs[sentIndex]
        taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
    # Reassemble each sentence by concatenating its chunks in chunk order.
    for sentId in taggedSentneces:
        sent = []
        taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
        for chunk in taggedChunks:
            sent.extend(chunk[0])
        taggedSentneces[sentId] = sent
    return taggedSentneces
def tagWithBigbossTransitions(unsegSentences): taggedSentences = tagPure(unsegSentences) # # Context Viterbi Decoding ## bigbossLearner = learner.Learner() bigbossTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\Bigboss\\FromDOcs\\bigbossDev.uni.txt' bigbossSents = Reader.readTaggsedSentences(bigbossTrainData) bigbossLearner.train(bigbossSents) bigbossLearner.laplaceSmoothTransitions() vitDecoder = tagger.ViteriDecoder(bigbossLearner) outputs = [] for _, topKOutput in taggedSentences.iteritems(): sentence = map(lambda x:x[0], topKOutput) topKTags = map(lambda x:x[1], topKOutput) tags = vitDecoder.decodeTopK(topKTags, sentence) output = zip(sentence, tags) outputs.append(output) print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs) return # # Writing taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextTagged.csv' writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n') # for sent in taggedSentneces.itervalues(): for sent in outputs: for line in sent: writer.writerow(line) writer.writerow(["", ""])
def tagTaggedSentences(testSents, taggerTags):
    """POS-tag whole sentences that already carry a per-sentence language
    label, print accuracy statistics, and print the tagged output.

    testSents  -- gold-tagged test sentences (one entry per sentence).
    taggerTags -- parallel list of language labels, "E" (English) or "H"
                  (Hindi), selecting which tagger handles each sentence.
    """
    # Train the English tagger on WSJ data.
    enLearner = learner.Learner()
    enTrainData = "C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt"
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)
    # Train the Hindi tagger on Hindi data.
    hiLearner = learner.Learner()
    hiTrainData = "C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt"
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)
    # Keep each sentence's original position so the combined output can be
    # re-ordered afterwards: entries are (sentence, originalIndex).
    enTestSents = [(testSents[index], index) for index in range(len(testSents)) if taggerTags[index] == "E"]
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences(map(lambda x: x[0], enTestSents))
    enTagger.tag()
    hiTestSents = [(testSents[index], index) for index in range(len(testSents)) if taggerTags[index] == "H"]
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences(map(lambda x: x[0], hiTestSents))
    hiTagger.tag()
    # Report per-language and combined (micro-averaged) accuracy.
    print "English Accuracy:", enTagger.accuracy()
    print "Hindi Accuracy:", hiTagger.accuracy()
    enCorrect, enTotal = enTagger.getAccuCounts()
    hiCorrect, hiTotal = hiTagger.getAccuCounts()
    print "EN Total:", enTotal, "EN Correct:", enCorrect
    print "HI Total:", hiTotal, "HI Correct:", hiCorrect
    print "Total Accuracy:", (enCorrect + hiCorrect) * 100.0 / (enTotal + hiTotal)
    # Map each tagged output back to its original sentence index.
    taggedSentences = {}
    assert len(enTagger.outputs) == len(enTestSents)
    for sentIndex in range(len(enTagger.outputs)):
        output = enTagger.outputs[sentIndex]
        testSent = enTestSents[sentIndex]
        testSentIndex = testSent[1]
        taggedSentences[testSentIndex] = output
    assert len(hiTagger.outputs) == len(hiTestSents)
    for sentIndex in range(len(hiTagger.outputs)):
        output = hiTagger.outputs[sentIndex]
        testSent = hiTestSents[sentIndex]
        testSentIndex = testSent[1]
        taggedSentences[testSentIndex] = output
    # Print sentences in their original order, one tab-separated token/tag
    # pair per line, blank line between sentences.
    for index in range(len(testSents)):
        print "\n".join(map(lambda x: "\t".join(x), taggedSentences[index])) + "\n"
def tagBigboss():
    """Load the Bigboss code-switched sentences from CSV, map them to the
    universal tagset with language tags, and hand them to
    tagTaggedSentences.

    NOTE(review): tagTaggedSentences is called here with a single argument,
    but the definition visible in this file takes (testSents, taggerTags) —
    either a one-argument variant exists elsewhere or this call raises a
    TypeError; confirm which definition is intended. This file also defines
    tagBigboss more than once, so later definitions shadow this one.
    """
    unsegBigBoss = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
    # Columns 2 and 5 of the CSV hold the token and tag fields — presumably;
    # verify against readTaggsedSentencesCSV.
    unsegSentences = Reader.readTaggsedSentencesCSV(unsegBigBoss, 2, 5)
    unsegSentences = map2UniWithLtag(unsegSentences)
    #for sent in unsegSentences:
    #    print '\n'.join(map(lambda x:x[1], sent))+'\n'
    #sys.exit()
    tagTaggedSentences(unsegSentences)
def tagUntaggedSentences(unsegSentences):
    """Split untagged code-switched sentences into monolingual chunks,
    POS-tag each chunk with its language-specific tagger, reassemble the
    sentences, and write the result to a CSV file.

    Unlike tagBigboss/tagPure, this prints no accuracy figures — the input
    carries no gold tags.
    """
    # purelySplitUntaggedSentences entries: [0]=chunk tokens, [1]=language
    # tag ('E'/'H'), [2]=sentence id, [3]=chunk position — presumably; verify.
    testSents = purelySplitUntaggedSentences(unsegSentences)
    # Train the English tagger on WSJ data.
    enLearner = learner.Learner()
    enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)
    # Train the Hindi tagger on Hindi data.
    hiLearner = learner.Learner()
    hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)
    enTestSents = [(testSents[index][0], testSents[index][2], testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'E']
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences(map(lambda x:x[0], enTestSents))
    enTagger.tag()
    hiTestSents = [(testSents[index][0], testSents[index][2], testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'H']
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences(map(lambda x:x[0], hiTestSents))
    hiTagger.tag()
    # Group tagged chunks by sentence id: sentId -> [(taggedChunk, chunkPos)].
    taggedSentneces = dd(list)
    assert len(enTestSents) == len(enTagger.outputs)
    for sentIndex in range(len(enTagger.outputs)):
        output = enTagger.outputs[sentIndex]
        taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
    assert len(hiTestSents) == len(hiTagger.outputs)
    for sentIndex in range(len(hiTagger.outputs)):
        output = hiTagger.outputs[sentIndex]
        taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
    # Reassemble each sentence by concatenating its chunks in chunk order.
    for sentId in taggedSentneces:
        sent = []
        taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
        for chunk in taggedChunks:
            sent.extend(chunk[0])
        taggedSentneces[sentId] = sent
    # Write one row per token, with a blank ["", ""] row between sentences.
    taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\cominedWordsLangsTagged.csv'
    writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
    for sent in taggedSentneces.itervalues():
        for line in sent:
            writer.writerow(line)
        writer.writerow(["", ""])
def tagBigboss():
    """Tag the pre-segmented Bigboss dev set sentence-by-sentence.

    Reads the universal-tagset test sentences, looks up the per-sentence
    language labels ("E"/"H") from the segmented CSV, runs the two-tagger
    pipeline, and prints how many manual vs. inferred labels were found.

    NOTE: this file defines tagBigboss more than once; the last definition
    wins at import time.
    """
    segmentedCsv = "C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsSegWithIDs.csv"
    uniDevFile = "C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\\bigbossDev.uni.txt"
    # prepareTestDataUni(segmentedCsv, uniDevFile)  # one-off conversion step
    devSentences = Reader.readTaggsedSentences(uniDevFile)
    manualLangTags = getTaggerTags(segmentedCsv)
    inferredLangTags = inferTaggerTags(segmentedCsv)
    tagTaggedSentences(devSentences, manualLangTags)
    print "Length of tags:", len(manualLangTags)
    print "Length of inferred tags:", len(inferredLangTags)
def tagWithManualTaggerTags(unsegSentences):
    """Re-decode the top-K tag candidates from tagPure() with a Viterbi
    decoder chosen per sentence by a hand-written list of language labels,
    print the accuracy, and write the result to a CSV file.
    """
    # sentId -> list of (token, top-K tags) pairs, from the pure taggers.
    taggedSentences = tagPure(unsegSentences)
    # Manually assigned per-sentence language labels ("E"/"H") for this
    # particular data set — hard-coded, so the function only works for the
    # input these 28 labels were written for.
    taggerTags = ["H", "H", "E", "H", "H", "H", "H", "H", "H", "H", "E", "H", "E", "E", "H", "E", "E", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"]
    assert len(taggedSentences) == len(taggerTags)
    # Train and smooth both language models for Viterbi decoding.
    enLearner = learner.Learner()
    enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)
    enLearner.laplaceSmoothTransitions()
    hiLearner = learner.Learner()
    hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)
    hiLearner.laplaceSmoothTransitions()
    hiVitDecoder = tagger.ViteriDecoder(hiLearner)
    enVitDecoder = tagger.ViteriDecoder(enLearner)
    outputs = []
    index = 0
    # NOTE(review): `index` pairs taggerTags with dict iteration order, so
    # taggedSentences must iterate in the same order the labels were written
    # for — confirm. If a label is neither "E" nor "H", `tags` would be
    # unbound (or stale from the previous iteration) at the zip() below.
    for _, topKOutput in taggedSentences.iteritems():
        sentence = map(lambda x:x[0], topKOutput)
        topKTags = map(lambda x:x[1], topKOutput)
        if taggerTags[index] == "E":
            tags = enVitDecoder.decodeTopK(topKTags, sentence)
        elif taggerTags[index] == "H":
            tags = hiVitDecoder.decodeTopK(topKTags, sentence)
        output = zip(sentence, tags)
        outputs.append(output)
        index += 1
    print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs)
    # Write one row per token, with a blank ["", ""] row between sentences.
    taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextMLTagged.csv'
    writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
    # for sent in taggedSentneces.itervalues():
    for sent in outputs:
        for line in sent:
            writer.writerow(line)
        writer.writerow(["", ""])
# NOTE(review): the block below up to the __main__ guard appears to be the
# orphaned tail of a function (it matches the end of tagBigboss at the top of
# this file) left over from a merge/paste — names like enTestSents, enTagger,
# and taggedSentneces are undefined at module level, so this code cannot run
# as written. Confirm and remove or re-attach to its function.
assert len(enTestSents) == len(enTagger.outputs)
for sentIndex in range(len(enTagger.outputs)):
    output = enTagger.outputs[sentIndex]
    taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
assert len(hiTestSents) == len(hiTagger.outputs)
for sentIndex in range(len(hiTagger.outputs)):
    output = hiTagger.outputs[sentIndex]
    taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
# Reassemble each sentence by concatenating its chunks in chunk order.
for sentId in taggedSentneces:
    sent = []
    taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
    for chunk in taggedChunks:
        sent.extend(chunk[0])
    taggedSentneces[sentId] = sent
taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
for sent in taggedSentneces.itervalues():
    for line in sent:
        writer.writerow(line)
    writer.writerow(["", ""])
if __name__ == '__main__':
    # Entry point: read the Bigboss code-switched CSV (token/tag columns 2
    # and 5), map to the universal tagset with language tags, and tag it.
    unsegBigBoss = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
    unsegSentences = Reader.readTaggsedSentencesCSV(unsegBigBoss, 2, 5)
    unsegSentences = map2UniWithLtag(unsegSentences)
    # prepareTestDataUni(bigBossData, uniTestData)
    tagBigboss(unsegSentences)
def tagFB():
    """Tag the Facebook POS-annotated data set.

    Reads the annotated CSV (token column 0, tag column 3) and feeds the
    sentences to tagTaggedSentences.
    """
    fbCsvPath = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBPOSAnnotated.csv'
    fbSentences = Reader.readTaggsedSentencesCSV(fbCsvPath, 0, 3)
    #tagUntaggedSentences(fbSentences)  # alternative pipeline, disabled
    tagTaggedSentences(fbSentences)
def tagFB():
    """Tag the Facebook POS-annotated data set with manual language labels.

    Reads the annotated CSV (token column 0, tag column 3) and runs the
    per-sentence manual-language-label decoding pipeline. NOTE: this file
    defines tagFB more than once; the last definition wins at import time.
    """
    fbCsvPath = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBPOSAnnotated.csv'
    fbSentences = Reader.readTaggsedSentencesCSV(fbCsvPath, 0, 3)
    #tagWithBigbossTransitions(fbSentences)  # alternative pipeline, disabled
    tagWithManualTaggerTags(fbSentences)
def loadDictionaries(self):
    """Load the English and Hindi word/tag dictionaries into the guesser.

    Bug fix: the two training-data paths were swapped — `enTaggedData`
    pointed at the Hindi file and `hiTaggedData` at the WSJ (English) file,
    the opposite of every other function in this module, so the guesser
    received the dictionaries in the wrong order. English now comes from
    WSJ, Hindi from the Hindi training data.
    """
    enTaggedData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
    hiTaggedData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
    enWordTags = Reader.readTaggsedSentences(enTaggedData)
    hiWordTags = Reader.readTaggsedSentences(hiTaggedData)
    # loadDicts expects (English, Hindi) in this order — consistent with the
    # en/hi naming used throughout this module.
    self.learner.guesser.loadDicts(enWordTags, hiWordTags)
def loadTestSentencesFromFile(self, inputFile):
    """Read tagged sentences from inputFile and keep them as the test set."""
    sentences = Reader.readTaggsedSentences(inputFile)
    self.testSentences = sentences