class NgramTagger(object):
    """Trigram POS tagger with a graceful backoff chain.

    The chain is: trigram -> bigram -> unigram -> affix -> default ('NN'),
    so contexts unseen at one order fall back to progressively simpler
    models instead of returning None.
    """

    def __init__(self):
        # Populated by train(); tag() refuses to run until then.
        self.tagger = None

    def train(self, sentence_list):
        """Build the backoff chain from *sentence_list*.

        sentence_list: tagged training sentences, each a list of
        (word, tag) pairs as produced by the NLTK tagged corpora.
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        self.tagger = TrigramTagger(sentence_list, backoff=bigram_fallback)

    def tag(self, words):
        """Return a list of (word, tag) pairs for the token list *words*.

        Raises:
            ValueError: if train() has not been called yet.
        """
        if not self.tagger:
            # ValueError is narrower than the original bare Exception while
            # remaining a subclass of it, so existing callers still catch it.
            raise ValueError("Trigram Tagger not trained.")
        return self.tagger.tag(words)
class SubjectTrigramTagger(object):
    """Trigram POS tagger backed by bigram, unigram and default-NN taggers."""

    def __init__(self, train_sents):
        # Build the backoff chain from the simplest model upward; each
        # tagger falls back to the previous one for unseen contexts.
        fallback = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            fallback = tagger_cls(train_sents, backoff=fallback)
        self.tagger = fallback

    def tag(self, tokens):
        """Return (token, tag) pairs for *tokens* via the trained chain."""
        return self.tagger.tag(tokens)
class Tagger(object):
    """Sentence tagger selecting between an n-gram chain and an HMM.

    mode: TRIGRAM builds a unigram -> bigram -> trigram backoff chain;
    HDM trains a hidden Markov model tagger on *train_sents*.
    """

    def __init__(self, mode, train_sents):
        if mode == TRIGRAM:
            self.tagger = UnigramTagger(train_sents)
            self.tagger = BigramTagger(train_sents, backoff=self.tagger)
            self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
        elif mode == HDM:
            # BUG FIX: the original wrote `elif HDM:`, testing the
            # constant's truthiness, which selected the HMM for *any*
            # mode other than TRIGRAM. Compare against the argument.
            self.tagger = HiddenMarkovModelTagger.train(train_sents)

    def tag(self, sentence):
        """Tokenize *sentence* (a string) and return (word, tag) pairs."""
        sentence_tokens = nltk.word_tokenize(sentence)
        return self.tagger.tag(sentence_tokens)
class SimpleChunkParser(ChunkParserI):
    """Chunker that maps POS-tag sequences to IOB tags with a TrigramTagger."""

    def __init__(self, trainingChunkedSents):
        # Train on (POS, IOB) pairs extracted from each chunk tree.
        trainingData = []
        for chunkedSent in trainingChunkedSents:
            trainingData.append(
                [(posTag, bioTag)
                 for word, posTag, bioTag in tree2conlltags(chunkedSent)]
            )
        self.tagger = TrigramTagger(trainingData)

    def parse(self, sent):
        """Chunk a POS-tagged sentence *sent* and return an NLTK tree."""
        posTags = [posTag for word, posTag in sent]
        predicted = self.tagger.tag(posTags)
        chunkedSent = []
        for (word, posTag), (_, bioTag) in zip(sent, predicted):
            chunkedSent.append((word, posTag, bioTag))
        return conlltags2tree(chunkedSent)
class TrigramChunkParser(ChunkParserI):
    """ChunkParserI implementation driven by a trigram tagger over POS tags."""

    def __init__(self, train_sents):
        # Keep only the (POS-TAG, IOB-CHUNK-TAG) pairs for training.
        train_data = []
        for sent in train_sents:
            train_data.append(
                [(pos, chunk) for _, pos, chunk in tree2conlltags(sent)]
            )
        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """Chunk *sentence* (a list of (word, pos) pairs) into a tree."""
        pos_tags = [pos for _, pos in sentence]
        chunked = self.tagger.tag(pos_tags)
        conlltags = []
        for (word, pos), (_, chunk) in zip(sentence, chunked):
            conlltags.append((word, pos, chunk))
        return conlltags2tree(conlltags)
class SubjectTrigramTagger(object):
    """NLTK TrigramTagger with bigram/unigram/default-NN backoff.

    Each tagger in the chain defers to the next simpler model for
    contexts it has not seen, bottoming out at a DefaultTagger that
    labels every word as a noun (NN).
    """

    def __init__(self, train_sents):
        """train_sents: pre-tagged training sentences (currently the
        Brown, conll2000 and TreeBank corpora)."""
        default = DefaultTagger('NN')
        unigram = UnigramTagger(train_sents, backoff=default)
        bigram = BigramTagger(train_sents, backoff=unigram)
        self.tagger = TrigramTagger(train_sents, backoff=bigram)

    def tag(self, tokens):
        """Return (token, tag) pairs for the token list *tokens*."""
        return self.tagger.tag(tokens)
class TrigramChunkParser(ChunkParserI):
    """Trigram-based chunker: predicts IOB chunk tags from POS-tag trigrams."""

    def __init__(self, train_sents):
        # Reduce each chunk tree to its (POS-TAG, IOB-CHUNK-TAG) pairs
        # and train a TrigramTagger on those sequences.
        pair_seqs = [
            [(pos_tag, chunk_tag)
             for _word, pos_tag, chunk_tag in tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = TrigramTagger(pair_seqs)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence and return the result as a tree."""
        pos_tags = [pos for _word, pos in sentence]
        # Predict one chunk tag per POS tag.
        predictions = self.tagger.tag(pos_tags)
        # Reassemble (word, pos, chunk) triplets and convert to a tree.
        conlltags = [
            (word, pos_tag, chunk_tag)
            for (word, pos_tag), (_pos, chunk_tag) in zip(sentence, predictions)
        ]
        return conlltags2tree(conlltags)
class Chunker(nltk.ChunkParserI):
    """n-gram chunker over POS tags; predict() returns nested dict output."""

    def __init__(self, train_sents, to_detect_list, n_gram=1):
        # Train on (POS, chunk-tag) pairs; stack higher-order taggers on
        # top of the unigram model when n_gram asks for them.
        train_data = [[(t, c) for w, t, c in sent] for sent in train_sents]
        self.tagger = UnigramTagger(train_data)
        if n_gram > 1:
            self.tagger = BigramTagger(train_data, backoff=self.tagger)
        if n_gram > 2:
            self.tagger = TrigramTagger(train_data, backoff=self.tagger)
        self.to_detect_list = to_detect_list

    def traverse_to_dic(self, t, dicc):
        """Recursively flatten tree *t* into the list *dicc*.

        A leaf contributes its first element (the word) directly; a
        subtree becomes a {label: children} dict whose children list is
        filled by recursing over the subtree.
        """
        try:
            label = t.label()
        except AttributeError:
            # t is a leaf tuple, not a Tree: keep just the word.
            dicc.append(list(t)[0])
            return None
        children = []
        dicc.append({label: children})
        for child in t:
            self.traverse_to_dic(child, children)
        return None

    def parse(self, sentence):
        """Chunk the POS-tagged *sentence* into an NLTK tree."""
        pos_only = [pos for (_, pos) in sentence]
        predicted = self.tagger.tag(pos_only)
        triples = [
            (word, pos, chunktag)
            for (word, pos), (_, chunktag) in zip(sentence, predicted)
        ]
        return nltk.chunk.conlltags2tree(triples)

    def predict(self, sentence):
        """Parse *sentence* and return its chunks as nested dicts/words."""
        result = []
        self.traverse_to_dic(self.parse(sentence), result)
        return result