Example #1
0
def tagBigboss(unsegSentences):
  enLearner = learner.Learner()
  enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)
  
  hiLearner = learner.Learner()
  hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)
  
  SS = SentenceSplitter()
  SS.trainLMsDefault()
  SS.loadSentences(unsegSentences)
  SS.scoreSentences()
  SS.splitSentences()
  testSents = SS.splittedSentences
  
  enTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'E']
  enTagger = tagger.Tagger(enLearner)
  enTagger.loadTestSentences(map(lambda x:x[0], enTestSents))
  enTagger.tag()
  
  hiTestSents = [(testSents[index][0],testSents[index][2],testSents[index][3]) for index in range(len(testSents)) if testSents[index][1] == 'H']
  hiTagger = tagger.Tagger(hiLearner)
  hiTagger.loadTestSentences(map(lambda x:x[0],hiTestSents))
  hiTagger.tag()
  
  print "English Accuracy:", enTagger.accuracy()
  print "Hindi Accuracy:", hiTagger.accuracy()
  enCorrect, enTotal = enTagger.getAccuCounts()
  hiCorrect, hiTotal = hiTagger.getAccuCounts()
  print "EN Total:", enTotal, "EN Correct:", enCorrect
  print "HI Total:", hiTotal, "HI Correct:", hiCorrect
  print "Total Accuracy:", (enCorrect+hiCorrect)*100.0/(enTotal+hiTotal)
  
  taggedSentneces = dd(list)
  
  assert len(enTestSents) == len(enTagger.outputs)
  for sentIndex in range(len(enTagger.outputs)):
    output = enTagger.outputs[sentIndex]
    taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
  
  assert len(hiTestSents) == len(hiTagger.outputs)
  for sentIndex in range(len(hiTagger.outputs)):
    output = hiTagger.outputs[sentIndex]
    taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
  
  for sentId in taggedSentneces:
    sent = []
    taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
    for chunk in taggedChunks:
      sent.extend(chunk[0])
    taggedSentneces[sentId] = sent
  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
  writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
  for sent in taggedSentneces.itervalues():
    for line in sent:
      writer.writerow(line)
    writer.writerow(["", ""])
Example #2
0
def tagWithBigbossTransitions(unsegSentences):
  taggedSentences = tagPure(unsegSentences)

  # # Context Viterbi Decoding ##
  bigbossLearner = learner.Learner()
  bigbossTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\Bigboss\\FromDOcs\\bigbossDev.uni.txt'
  bigbossSents = Reader.readTaggsedSentences(bigbossTrainData)
  bigbossLearner.train(bigbossSents)
  bigbossLearner.laplaceSmoothTransitions()
  vitDecoder = tagger.ViteriDecoder(bigbossLearner)
  outputs = []
  for _, topKOutput in taggedSentences.iteritems():
    sentence = map(lambda x:x[0], topKOutput)
    topKTags = map(lambda x:x[1], topKOutput)
    tags = vitDecoder.decodeTopK(topKTags, sentence)
    output = zip(sentence, tags)
    outputs.append(output)
  print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs)
  return
  # # Writing
  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextTagged.csv'
  writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
  # for sent in taggedSentneces.itervalues():
  for sent in outputs:
    for line in sent:
      writer.writerow(line)
    writer.writerow(["", ""])
Example #3
0
def tagUntaggedSentences(unsegSentences):
  """Language-split untagged code-mixed sentences, POS-tag each chunk with
  the matching monolingual tagger, and write the reassembled tagged
  sentences to a CSV file.

  unsegSentences: unsegmented sentences accepted by
    purelySplitUntaggedSentences (project type; schema not visible here).

  Side effects: writes 'cominedWordsLangsTagged.csv'. Returns None.
  """
  # Each entry appears to be (tokens, langTag, sentId, chunkPos) -- inferred
  # from the indexing below; confirm against purelySplitUntaggedSentences.
  testSents = purelySplitUntaggedSentences(unsegSentences)

  # Train an English POS learner on the WSJ training data.
  # NOTE: paths now use explicit '\\' escapes; the previous forms relied on
  # Python 2 leaving unknown escapes intact. Runtime values are unchanged.
  enLearner = learner.Learner()
  enTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\POSAnnotated\\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)

  # Train a Hindi POS learner.
  hiLearner = learner.Learner()
  hiTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\POSAnnotated\\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)

  # Tag the English chunks.
  enTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'E']
  enTagger = tagger.Tagger(enLearner)
  enTagger.loadTestSentences([x[0] for x in enTestSents])
  enTagger.tag()

  # Tag the Hindi chunks.
  hiTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'H']
  hiTagger = tagger.Tagger(hiLearner)
  hiTagger.loadTestSentences([x[0] for x in hiTestSents])
  hiTagger.tag()

  # Regroup tagged chunks by their originating sentence id.
  taggedSentneces = dd(list)

  assert len(enTestSents) == len(enTagger.outputs)
  for output, (_, sentId, chunkPos) in zip(enTagger.outputs, enTestSents):
    taggedSentneces[sentId].append((output, chunkPos))

  assert len(hiTestSents) == len(hiTagger.outputs)
  for output, (_, sentId, chunkPos) in zip(hiTagger.outputs, hiTestSents):
    taggedSentneces[sentId].append((output, chunkPos))

  # Stitch each sentence back together in original chunk order.
  # key=int(...) replaces the Python-2-only cmp= subtraction; same ordering.
  for sentId in taggedSentneces:
    sent = []
    for chunk in sorted(taggedSentneces[sentId], key=lambda c: int(c[1])):
      sent.extend(chunk[0])
    taggedSentneces[sentId] = sent

  taggedFB = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\cominedWordsLangsTagged.csv'
  # Fix: close the output file deterministically (was left to the GC).
  # NOTE(review): 'comined' looks like a typo for 'combined', but it is the
  # on-disk filename -- left untouched so existing consumers still find it.
  with open(taggedFB, 'w') as outFile:
    writer = UnicodeWriter(outFile, lineterminator='\n')
    for sent in taggedSentneces.itervalues():
      for line in sent:
        writer.writerow(line)
      # Blank row separates sentences in the CSV.
      writer.writerow(["", ""])
Example #4
0
def tagWithManualTaggerTags(unsegSentences):
  taggedSentences = tagPure(unsegSentences)
  taggerTags = ["H", "H", "E", "H", "H", "H", "H", "H", "H", "H", "E", "H", "E", "E", "H", "E", "E", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"]
  assert len(taggedSentences) == len(taggerTags)
  
  enLearner = learner.Learner()
  enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)
  enLearner.laplaceSmoothTransitions()
  
  hiLearner = learner.Learner()
  hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)
  hiLearner.laplaceSmoothTransitions()
  
  hiVitDecoder = tagger.ViteriDecoder(hiLearner)
  enVitDecoder = tagger.ViteriDecoder(enLearner)
  outputs = []
  index = 0
  for _, topKOutput in taggedSentences.iteritems():
    sentence = map(lambda x:x[0], topKOutput)
    topKTags = map(lambda x:x[1], topKOutput)
    if taggerTags[index] == "E":
      tags = enVitDecoder.decodeTopK(topKTags, sentence)
    elif taggerTags[index] == "H":
      tags = hiVitDecoder.decodeTopK(topKTags, sentence)
    output = zip(sentence, tags)
    outputs.append(output)
    index += 1
  print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs)
  
  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextMLTagged.csv'
  writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
  # for sent in taggedSentneces.itervalues():
  for sent in outputs:
    for line in sent:
      writer.writerow(line)
    writer.writerow(["", ""])