class NgramTagger(object):
    """Trigram POS tagger with a graceful backoff chain.

    The chain is TrigramTagger -> BigramTagger -> UnigramTagger ->
    AffixTagger -> DefaultTagger('NN'), so contexts unseen at one
    order fall back to the next, and completely unknown words are
    tagged as nouns ('NN').
    """

    def __init__(self):
        # Set by train(); None means tag() must not be called yet.
        self.tagger = None

    def train(self, sentence_list):
        """Train the backoff chain.

        sentence_list: list of tagged sentences, each a list of
            (word, tag) pairs, as produced by the NLTK corpora readers.
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list,
            backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list,
            backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list,
            backoff=unigram_fallback)
        self.tagger = TrigramTagger(sentence_list,
            backoff=bigram_fallback)

    def tag(self, words):
        """Tag a list of word tokens.

        Raises RuntimeError if train() has not been called.
        """
        if not self.tagger:
            # RuntimeError instead of a bare Exception: still caught by
            # any existing `except Exception` handler, but no longer
            # swallows unrelated errors for callers catching precisely.
            raise RuntimeError("Trigram Tagger not trained.")
        return self.tagger.tag(words)
# Beispiel #2 (score: 0) -- scraped example separator, kept as a comment so the file parses
class SubjectTrigramTagger(object):
    """Trigram tagger whose backoff chain bottoms out at DefaultTagger('NN')."""

    def __init__(self, train_sents):
        # Build the chain bottom-up: default noun tagger, then unigram,
        # bigram, and finally the trigram tagger that callers use.
        default = DefaultTagger('NN')
        unigram = UnigramTagger(train_sents, backoff=default)
        bigram = BigramTagger(train_sents, backoff=unigram)
        self.tagger = TrigramTagger(train_sents, backoff=bigram)

    def tag(self, tokens):
        """Delegate tagging of a token list to the trained chain."""
        return self.tagger.tag(tokens)
# Beispiel #3 (score: 0) -- scraped example separator, kept as a comment so the file parses
class Tagger(object):
    """Wrapper selecting between an n-gram backoff tagger and an HMM tagger.

    mode chooses the implementation: TRIGRAM builds a
    unigram -> bigram -> trigram backoff stack; HDM trains an NLTK
    HiddenMarkovModelTagger. (Both constants are defined elsewhere in
    this module.)
    """

    def __init__(self, mode, train_sents):
        if mode == TRIGRAM:
            self.tagger = UnigramTagger(train_sents)
            self.tagger = BigramTagger(train_sents, backoff=self.tagger)
            self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
        elif mode == HDM:
            # BUG FIX: the original wrote `elif HDM:`, which tests the
            # truthiness of the HDM constant itself rather than comparing
            # it to mode -- so any truthy HDM value turned the HMM branch
            # into the fallback for every non-TRIGRAM mode.
            self.tagger = HiddenMarkovModelTagger.train(train_sents)

    def tag(self, sentence):
        """Tokenize a raw sentence string and return tagged tokens."""
        sentence_tokens = nltk.word_tokenize(sentence)
        return self.tagger.tag(sentence_tokens)
class SimpleChunkParser(ChunkParserI):
    """Chunker that predicts BIO tags from POS-tag trigrams alone."""

    def __init__(self, trainingChunkedSents):
        # Project each chunked training sentence down to its
        # (POS, BIO) pairs; the words themselves are discarded.
        trainingData = []
        for chunkedSent in trainingChunkedSents:
            triples = tree2conlltags(chunkedSent)
            trainingData.append([(pos, bio) for _word, pos, bio in triples])
        self.tagger = TrigramTagger(trainingData)

    def parse(self, sent):
        """Chunk a POS-tagged sentence [(word, pos), ...] into a Tree."""
        posTags = [pos for _word, pos in sent]
        predicted = self.tagger.tag(posTags)
        bioTags = [bio for _pos, bio in predicted]
        chunkedSent = [
            (word, pos, bio)
            for (word, pos), bio in zip(sent, bioTags)
        ]
        return conlltags2tree(chunkedSent)
class TrigramChunkParser(ChunkParserI):
    """Chunk parser driven by a trigram tagger over POS-tag sequences."""

    def __init__(self, train_sents):
        # Keep only the (POS-TAG, IOB-CHUNK-TAG) pairs for training.
        train_data = []
        for sent in train_sents:
            pairs = [(pos, chunk) for _word, pos, chunk in tree2conlltags(sent)]
            train_data.append(pairs)
        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """Map a POS-tagged sentence [(word, pos), ...] to a chunk Tree."""
        pos_tags = [pos for _word, pos in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        # The tagger echoes each input POS tag back alongside its
        # predicted chunk tag, so the echoed value is used in the triplet
        # (exactly as the original's shadowed unpacking did).
        conlltags = []
        for (word, _pos), (pos, chunk) in zip(sentence, tagged_pos_tags):
            conlltags.append((word, pos, chunk))
        return conlltags2tree(conlltags)
class SubjectTrigramTagger(object):
    """NLTK TrigramTagger stacked on bigram and unigram backoffs, with a
    DefaultTagger that labels every unknown word as a noun ('NN').
    """

    def __init__(self, train_sents):
        """train_sents: sentences that have already been tagged
        (e.g. from the Brown, conll2000, and TreeBank corpora).
        """
        # Fold the backoff chain: each tagger class wraps the previous
        # chain as its fallback, ending with the trigram tagger on top.
        chain = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            chain = tagger_cls(train_sents, backoff=chain)
        self.tagger = chain

    def tag(self, tokens):
        """Tag a list of tokens with the trained chain."""
        return self.tagger.tag(tokens)
class TrigramChunkParser(ChunkParserI):
    """Trigram-based chunker: learns IOB chunk tags from POS-tag context."""

    def __init__(self, train_sents):
        # Reduce each training tree to its (POS-TAG, IOB-CHUNK-TAG) pairs
        # and train the trigram tagger on those pair sequences.
        train_data = [
            [(pos, chunk) for _word, pos, chunk in tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """Turn a POS-tagged sentence into a chunk tree."""
        pos_sequence = [pos for _word, pos in sentence]

        # Predict a chunk tag for each POS tag; the tagger returns
        # (pos, chunk) pairs with the input POS echoed back.
        predictions = self.tagger.tag(pos_sequence)

        # Reassemble (word, pos, chunk) triplets, taking the echoed POS
        # from the prediction (as the original's shadowed unpacking did).
        conll = []
        for (word, _pos), (pos, chunk) in zip(sentence, predictions):
            conll.append((word, pos, chunk))

        # Convert the CoNLL triplets back into a tree.
        return conlltags2tree(conll)
# Beispiel #8 (score: 0) -- scraped example separator, kept as a comment so the file parses
class Chunker(nltk.ChunkParserI):
    """N-gram chunk parser whose order (1..3) is chosen by n_gram."""

    def __init__(self, train_sents, to_detect_list, n_gram=1):
        # Training data is the (POS, chunk) projection of the triples.
        train_data = [[(pos, chunk) for _w, pos, chunk in sent]
                      for sent in train_sents]

        # Stack taggers up to the requested order; each level backs off
        # to the one below it.
        self.tagger = UnigramTagger(train_data)
        if n_gram > 1:
            self.tagger = BigramTagger(train_data, backoff=self.tagger)
        if n_gram > 2:
            self.tagger = TrigramTagger(train_data, backoff=self.tagger)
        self.to_detect_list = to_detect_list

    def traverse_to_dic(self, t, dicc):
        """Recursively flatten tree t into the list dicc.

        A leaf (no .label()) appends its first element; a subtree appends
        a {label: children} dict whose children list is filled by
        recursing into the subtree. Always returns None.
        """
        try:
            label = t.label()
        except AttributeError:
            # Not a Tree: treat as a leaf and record its first element.
            dicc.append(list(t)[0])
            return None

        children = []
        dicc.append({label: children})
        for child in t:
            self.traverse_to_dic(child, children)
        return None

    def parse(self, sentence):
        """Chunk a POS-tagged sentence [(word, pos), ...] into a Tree."""
        pos_tags = [pos for _word, pos in sentence]
        tagged = self.tagger.tag(pos_tags)
        chunk_tags = [chunk for _pos, chunk in tagged]
        conlltags = [(word, pos, chunk)
                     for (word, pos), chunk in zip(sentence, chunk_tags)]
        return nltk.chunk.conlltags2tree(conlltags)

    def predict(self, sentence):
        """Parse the sentence and return its flattened representation."""
        chunked = self.parse(sentence)
        result = []
        self.traverse_to_dic(chunked, result)
        return result