def tagBigboss(unsegSentences): enLearner = learner.Learner() enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt' enTrainSents = Reader.readTaggsedSentences(enTrainData) enLearner.train(enTrainSents) hiLearner = learner.Learner() hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt' hiTrainSents = Reader.readTaggsedSentences(hiTrainData) hiLearner.train(hiTrainSents) SS = SentenceSplitter() SS.trainLMsDefault() SS.loadSentences(unsegSentences) SS.scoreSentences() SS.splitSentences() testSents = SS.splittedSentences enTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'E'] enTagger = tagger.Tagger(enLearner) enTagger.loadTestSentences(map(lambda x:x[0], enTestSents)) enTagger.tag() hiTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'H'] hiTagger = tagger.Tagger(hiLearner) hiTagger.loadTestSentences(map(lambda x:x[0],hiTestSents)) hiTagger.tag() print "English Accuracy:", enTagger.accuracy() print "Hindi Accuracy:", hiTagger.accuracy() enCorrect, enTotal = enTagger.getAccuCounts() hiCorrect, hiTotal = hiTagger.getAccuCounts() print "EN Total:", enTotal, "EN Correct:", enCorrect print "HI Total:", hiTotal, "HI Correct:", hiCorrect print "Total Accuracy:", (enCorrect+hiCorrect)*100.0/(enTotal+hiTotal) taggedSentneces = dd(list) assert len(enTestSents) == len(enTagger.outputs) for sentIndex in range(len(enTagger.outputs)): output = enTagger.outputs[sentIndex] taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2])) assert len(hiTestSents) == len(hiTagger.outputs) for sentIndex in range(len(hiTagger.outputs)): output = hiTagger.outputs[sentIndex] taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2])) for sentId in taggedSentneces: sent = [] taggedChunks = 
sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1])) for chunk in taggedChunks: sent.extend(chunk[0]) taggedSentneces[sentId] = sent taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv' writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n') for sent in taggedSentneces.itervalues(): for line in sent: writer.writerow(line) writer.writerow(["", ""])
def tagWithBigbossTransitions(unsegSentences): taggedSentences = tagPure(unsegSentences) # # Context Viterbi Decoding ## bigbossLearner = learner.Learner() bigbossTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\Bigboss\\FromDOcs\\bigbossDev.uni.txt' bigbossSents = Reader.readTaggsedSentences(bigbossTrainData) bigbossLearner.train(bigbossSents) bigbossLearner.laplaceSmoothTransitions() vitDecoder = tagger.ViteriDecoder(bigbossLearner) outputs = [] for _, topKOutput in taggedSentences.iteritems(): sentence = map(lambda x:x[0], topKOutput) topKTags = map(lambda x:x[1], topKOutput) tags = vitDecoder.decodeTopK(topKTags, sentence) output = zip(sentence, tags) outputs.append(output) print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs) return # # Writing taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextTagged.csv' writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n') # for sent in taggedSentneces.itervalues(): for sent in outputs: for line in sent: writer.writerow(line) writer.writerow(["", ""])
def tagUntaggedSentences(unsegSentences):
    """Split untagged code-mixed sentences into language chunks, POS-tag
    each chunk with the matching monolingual tagger, and write the
    re-assembled tagged sentences to a CSV file.

    Chunk tuples from purelySplitUntaggedSentences are indexed as:
    [0] tokens, [1] language tag ('E'/'H'), [2] sentence id, [3] chunk
    position (layout inferred from the indexing below).

    Side effects: writes cominedWordsLangsTagged.csv (filename typo is in
    the original and is preserved). Returns None.
    """
    testSents = purelySplitUntaggedSentences(unsegSentences)

    # ---- Train one POS learner per language -----------------------------
    # Raw strings: byte-identical values to the original fragile literals.
    enLearner = learner.Learner()
    enTrainData = r'C:\Users\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\train.uni.txt'
    enTrainSents = Reader.readTaggsedSentences(enTrainData)
    enLearner.train(enTrainSents)

    hiLearner = learner.Learner()
    hiTrainData = r'C:\Users\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\train.uni.txt'
    hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
    hiLearner.train(hiTrainSents)

    # ---- Tag each language's chunks with its own tagger -----------------
    enTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'E']
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences([s[0] for s in enTestSents])
    enTagger.tag()

    hiTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'H']
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences([s[0] for s in hiTestSents])
    hiTagger.tag()

    # ---- Re-assemble tagged chunks into full sentences ------------------
    # sentence id -> [(tagged chunk, chunk position), ...]
    taggedSentences = dd(list)
    assert len(enTestSents) == len(enTagger.outputs)
    for sentIndex, output in enumerate(enTagger.outputs):
        taggedSentences[enTestSents[sentIndex][1]].append(
            (output, enTestSents[sentIndex][2]))
    assert len(hiTestSents) == len(hiTagger.outputs)
    for sentIndex, output in enumerate(hiTagger.outputs):
        taggedSentences[hiTestSents[sentIndex][1]].append(
            (output, hiTestSents[sentIndex][2]))
    for sentId in taggedSentences:
        sent = []
        # key= replaces the removed-in-Py3 cmp=; ordering is identical.
        for chunk in sorted(taggedSentences[sentId], key=lambda c: int(c[1])):
            sent.extend(chunk[0])
        taggedSentences[sentId] = sent

    # ---- Write one token per row, blank row between sentences -----------
    taggedFB = r'C:\Users\t-phgad\Documents\Project\Data\cominedWordsLangsTagged.csv'
    outFile = open(taggedFB, 'w')
    try:
        writer = UnicodeWriter(outFile, lineterminator='\n')
        for sent in taggedSentences.itervalues():
            for line in sent:
                writer.writerow(line)
            writer.writerow(["", ""])
    finally:
        outFile.close()  # original leaked the file handle
def tagWithManualTaggerTags(unsegSentences): taggedSentences = tagPure(unsegSentences) taggerTags = ["H", "H", "E", "H", "H", "H", "H", "H", "H", "H", "E", "H", "E", "E", "H", "E", "E", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"] assert len(taggedSentences) == len(taggerTags) enLearner = learner.Learner() enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt' enTrainSents = Reader.readTaggsedSentences(enTrainData) enLearner.train(enTrainSents) enLearner.laplaceSmoothTransitions() hiLearner = learner.Learner() hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt' hiTrainSents = Reader.readTaggsedSentences(hiTrainData) hiLearner.train(hiTrainSents) hiLearner.laplaceSmoothTransitions() hiVitDecoder = tagger.ViteriDecoder(hiLearner) enVitDecoder = tagger.ViteriDecoder(enLearner) outputs = [] index = 0 for _, topKOutput in taggedSentences.iteritems(): sentence = map(lambda x:x[0], topKOutput) topKTags = map(lambda x:x[1], topKOutput) if taggerTags[index] == "E": tags = enVitDecoder.decodeTopK(topKTags, sentence) elif taggerTags[index] == "H": tags = hiVitDecoder.decodeTopK(topKTags, sentence) output = zip(sentence, tags) outputs.append(output) index += 1 print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs) taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextMLTagged.csv' writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n') # for sent in taggedSentneces.itervalues(): for sent in outputs: for line in sent: writer.writerow(line) writer.writerow(["", ""])