def tagBigboss(unsegSentences=None):
    """Tag the Bigboss code-switched sentences with the POS tagger.

    When *unsegSentences* is None (the default, preserving the original
    zero-argument behaviour) the sentences are read from the hard-coded
    Bigboss CSV (word in column 2, tag in column 5) and mapped to the
    universal tagset with language tags. Either way the (possibly
    pre-loaded) sentences are then handed to tagTaggedSentences.

    :param unsegSentences: optional already-loaded, already-mapped
        sentences; lets the ``__main__`` block reuse its own data
        instead of re-reading the CSV.
    """
    if unsegSentences is None:
        unsegBigBoss = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
        unsegSentences = Reader.readTaggsedSentencesCSV(unsegBigBoss, 2, 5)
        unsegSentences = map2UniWithLtag(unsegSentences)
    tagTaggedSentences(unsegSentences)
def prepareTestDataUni(bigBossData, uniTestData):
    """Convert the Bigboss CSV into universal-tagset test data (TSV).

    Parses *bigBossData*: the header row is skipped; a row whose first
    column is empty marks a sentence boundary; otherwise columns 2-4
    hold (word, tag, language). The parsed sentences are mapped to the
    universal tagset with language tags and written to *uniTestData*.

    :param bigBossData: path of the input CSV file.
    :param uniTestData: path of the TSV file to write.
    """
    sentences = []
    sentence = []
    # Use a context manager so the input handle is closed deterministically
    # (the original leaked the file object opened inline).
    with open(bigBossData) as csvFile:
        reader = csv.reader(csvFile)
        next(reader)  # skip the header row
        for line in reader:
            if line[0] == '':
                # empty first column separates sentences
                sentences.append(sentence)
                sentence = []
                continue
            wordTagLang = (line[2], line[3], line[4])
            sentence.append(wordTagLang)
    # flush a trailing sentence that has no terminating blank row
    if sentence:
        sentences.append(sentence)
    uniSentences = map2Uni.map2UniWithLtag(sentences)
    Writer.writePOSSentencesTSV(uniSentences, uniTestData)
    # NOTE(review): this is the tail of a function whose `def` line is not
    # visible here; enTestSents / hiTestSents, enTagger / hiTagger and
    # taggedSentneces (sic — typo kept for consistency) are bound earlier
    # in that function.
    # Each test sentence is (tokens, sentenceId, chunkIndex); the tagger
    # must have produced exactly one output per input sentence.
    assert len(enTestSents) == len(enTagger.outputs)
    for sentIndex in range(len(enTagger.outputs)):
        output = enTagger.outputs[sentIndex]
        # group tagged English chunks by their sentence id, remembering
        # the chunk's position within the sentence
        taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
    assert len(hiTestSents) == len(hiTagger.outputs)
    for sentIndex in range(len(hiTagger.outputs)):
        output = hiTagger.outputs[sentIndex]
        # same grouping for the Hindi chunks
        taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
    for sentId in taggedSentneces:
        sent = []
        # reassemble each sentence: order its chunks by their numeric
        # position (Python 2 `cmp=` comparator) and concatenate them
        taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
        for chunk in taggedChunks:
            sent.extend(chunk[0])
        taggedSentneces[sentId] = sent
    taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
    # write the reassembled sentences, one token row per line, with an
    # empty row between sentences (Python 2 itervalues())
    writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
    for sent in taggedSentneces.itervalues():
        for line in sent:
            writer.writerow(line)
        writer.writerow(["", ""])


if __name__ == '__main__':
    # Load the code-switched Bigboss sentences (word col 2, tag col 5)
    # and map them to the universal tagset with language tags.
    unsegBigBoss = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
    unsegSentences = Reader.readTaggsedSentencesCSV(unsegBigBoss, 2, 5)
    unsegSentences = map2UniWithLtag(unsegSentences)
#    prepareTestDataUni(bigBossData, uniTestData)
    # NOTE(review): tagBigboss is called with an argument here — confirm
    # its signature actually accepts one.
    tagBigboss(unsegSentences)