def process(self, document):
    """Tokenize, POS-tag and chunk every raw sentence of *document*.

    For each sentence in ``document.rawSentences`` this runs the GENIA
    tagger and stores three parallel per-sentence lists back on the
    document:

    - ``document.tokenizedSentences`` -- lists of tokens,
    - ``document.taggedSentences``    -- lists of (token, POS) pairs,
    - ``document.parsedSentences``    -- lists of (token, POS, chunk)
      triples (CoNLL-style; the tree conversion is intentionally left
      commented out upstream).

    Raises AssertionError if *document* is not a Document.
    """
    # NOTE: assert is stripped under `python -O`; kept as-is so the
    # exception type seen by callers does not change.
    assert isinstance(document, Document)
    tokenizedSentences = []
    taggedSentences = []
    parsedSentences = []
    for sentence in document.rawSentences:
        # Truncate over-long words -- presumably the GENIA tagger
        # misbehaves on them (original comment was cut short).
        words = [w[:MAX_WORDLEN] for w in sentence.split()]
        # Rebuild the sentence in one pass instead of quadratic `+=`.
        # Each word keeps its trailing space, matching the original
        # loop's output exactly (including the final trailing space).
        sentence = ''.join(w + ' ' for w in words)
        # The GENIA tagger expects byte strings; encode unicode input.
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf-8')
        # Delegate the per-sentence tagging to processSentence, which
        # performs the identical extraction (removes duplicated loop).
        tokenized, tokenizedTagged, parsed = self.processSentence(sentence)
        tokenizedSentences.append(tokenized)
        taggedSentences.append(tokenizedTagged)
        parsedSentences.append(parsed)
    document.tokenizedSentences = tokenizedSentences
    document.taggedSentences = taggedSentences
    document.parsedSentences = parsedSentences
def processSentence(self, sentence):
    """Run the GENIA tagger over a single sentence.

    Returns a 3-tuple of parallel lists:
      - the tokens of the sentence,
      - (token, POS-tag) pairs,
      - (token, POS-tag, chunk-tag) triples (CoNLL-style; the NLTK
        tree conversion remains disabled, as in the original).
    """
    tokens = []
    tagged = []
    chunked = []
    for row in geniatagger.tag_sentence(sentence):
        # Each tagger row is (word, base form, POS, chunk, NE);
        # the base form and NE tag are not used here.
        tok = row[0]
        pos = row[2]
        tokens.append(tok)
        tagged.append((tok, pos))
        chunked.append((tok, pos, row[3]))
    return tokens, tagged, chunked