# NLTK imports this example relies on; the module-level
# features(tokens, index, history) detector referenced below is defined
# elsewhere in the original source and is not shown in this excerpt.
import re

from collections.abc import Iterable

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.tag import ClassifierBasedTagger


class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def parse_tweets(self, tweets):
        regex = re.compile(
            '[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
        )
        named_entities_tree = ''
        for tweet in tweets:
            # Lowercase the tweet text and strip punctuation/URL fragments
            # before POS tagging.
            text = str(tweet.processed_text).lower()
            text = regex.sub('', text)
            current_tree = self.parse(pos_tag(word_tokenize(text)))
            named_entities_tree += str(current_tree)
        return named_entities_tree
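A minimal usage sketch for this chunker (not part of the original example): the sample training sentence, its IOB labels, and the input sentence are made up, and a real model would be trained on a full annotated corpus. It also assumes the module-level features() detector noted above is defined, and that the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages are installed.

# Hypothetical usage sketch; training data is a list of sentences, each a
# list of ((word, pos_tag), iob_tag) pairs.
training_samples = [
    [(('Paris', 'NNP'), 'B-geo'), (('is', 'VBZ'), 'O'), (('lovely', 'JJ'), 'O')],
]

chunker = NamedEntityChunker(training_samples)
tree = chunker.parse(pos_tag(word_tokenize("I was born in Paris")))
print(tree)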
Example #2
class Chunker(nltk.chunk.ChunkParserI):
    '''
    Chunker for SCLE. Only chunks NP for now.
    '''
    def __init__(self):

        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w,t), c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)

    def chunk(self, sentences):
        '''
        Chunk each POS-tagged sentence in `sentences` into an NP tree.
        '''
        chunked_sents = []
        for sent in sentences:
            c_sent = self._tagger.tag(sent)
            conlltags =[(w,t,c) for ((w,t),c) in c_sent]
            chunked_sents.append(nltk.chunk.conlltags2tree(conlltags))
        return chunked_sents
   
    def evaluate(self):
        '''
        Evaluate the chunker.
        '''
        print(self._tagger.evaluate(self._test_sents))
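A usage sketch for this chunker, not from the original source: it assumes nltk, conll2000, and ClassifierBasedTagger are imported as in the original module, that the conll2000 corpus has been downloaded (nltk.download('conll2000')), and that the npchunk_features detector referenced above is defined.

# Hypothetical usage sketch.
chunker = Chunker()  # trains on conll2000 at construction time
sents = [[('The', 'DT'), ('quick', 'JJ'), ('fox', 'NN'), ('jumped', 'VBD')]]
for tree in chunker.chunk(sents):
    print(tree)
chunker.evaluate()  # prints tagging accuracy on the conll2000 test split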
Example #3
class NamedEntityChunker(ChunkParserI):
    """Class with overridden parser and init. This class is equipped to learn and predict given
    training data. The data is [[()]]

    """
    def __init__(self, train_sents, feat_detector, **kwargs):

        assert isinstance(train_sents, Iterable)

        self.feature_detector = feat_detector
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=feat_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        """This function is used by evaluate to make guesses and format the guesses
        """
        #make gueess (tag)
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, p, t) for ((w, p), t) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #4
class ClassifierChunker(ChunkParserI):
	def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
		if not feature_detector:
			feature_detector = self.feature_detector
		
		train_chunks = chunk_trees2train_chunks(train_sents)
		self.tagger = ClassifierBasedTagger(train=train_chunks,
			feature_detector=feature_detector, **kwargs)
	
	def parse(self, tagged_sent):
		if not tagged_sent: return None
		chunks = self.tagger.tag(tagged_sent)
		return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
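The chunk_trees2train_chunks helper used above is not shown in this excerpt. Judging from the training format ClassifierBasedTagger expects and the inline conversion in the Chunker example earlier, a minimal version might look like this sketch (an assumption, not the original helper):

from nltk.chunk import tree2conlltags

def chunk_trees2train_chunks(chunk_sents):
    # Convert chunked nltk.Tree sentences into the [((word, pos), iob), ...]
    # shape used to train ClassifierBasedTagger.
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]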
class ClassifierChunker(ChunkParserI):  # pylint: disable = W0223
    """
        Classifier-based chunker class implementation
    """
    def __init__(self, train_sents, feature_detector, **kwargs):
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tokens):
        """
            Parse sentence into chunks
        """
        if not tokens:
            return None
        chunked = self.tagger.tag(tokens)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
Example #7
class NamedEntityChunker(ChunkParserI):
    '''
    Named Entity Chunker using ClassifierBasedTagger() and features generated by features().
    '''
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(feature_detector=features,
                                            train=train_sents,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform [((w1, t1), iob1), ...] to triplets [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Return the IOB triplets directly (this variant does not convert
        # them to an nltk.Tree).
        return iob_triplets
Example #8
class NEChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs
        )
    
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the normalized triplet format [(w1, t1, iob1), ...]
        iob_triplets = [(word, pos, chunk) for ((word, pos), chunk) in chunks]

        # Transform the list of triplets to NLTK tree format
        return conlltags2tree(iob_triplets)
Example #9
class address_chunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=self.features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all(ord(c) < 128 for c in word)  # every character is plain ASCII

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

    def save_to_file(self, file_name):
        # Pickle the trained chunker to disk.
        with open(file_name, "wb") as save_classifier:
            pickle.dump(self, save_classifier)

    def chunk(self, sentence):

        tagged_tree = self.parse(pos_tag(word_tokenize(sentence)))

        # Collect the token sequences of all subtrees accepted by tree_filter
        # (tree_filter is defined elsewhere; see the sketch after this class).
        chunks = []
        for subtree in tagged_tree.subtrees(filter=tree_filter):
            chunks.append(untag(subtree.leaves()))

        # Keep only the longest chunk found in the sentence.
        max_length = 0
        for i in range(len(chunks)):
            if len(chunks[i]) > max_length:
                chunk = chunks[i]
                max_length = len(chunks[i])

        # Join the chunk's tokens with spaces, attaching punctuation
        # (and the first token) without a leading space.
        output = ''
        if len(chunks) > 0:
            for i in range(len(chunk)):
                if not chunk[i] == '.' and not chunk[i] == ',' and not i == 0:
                    output = output + ' ' + chunk[i]
                else:
                    output = output + chunk[i]

        return output
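The tree_filter callable used in chunk() above is not defined in this excerpt. A plausible minimal version, assuming the training trees mark address chunks with an 'ADDRESS' label (that label is an assumption):

def tree_filter(tree):
    # Hypothetical: keep only subtrees whose chunk label marks an address.
    return tree.label() == 'ADDRESS'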
class AddressChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
    
    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/ 
        
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all(ord(c) < 128 for c in word)  # every character is plain ASCII

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,

            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,

            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,

            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,

            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,

            'prev-iob': previob,

            'contains-dash': contains_dash,
            'contains-dot': contains_dot,

            'all-caps': allcaps,
            'capitalized': capitalized,

            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,

            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

        return f
Example #11
    scikit_classifier = ScikitClassifier(classifier=classifier,
                                         vectorizer=vectorizer)

    return scikit_classifier


# Train an online-learning POS tagger

if __name__ == "__main__":
    test_data = read_ud_pos_data(
        r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu')

    start_time = time.time()
    print("Starting training ...")

    tagger = ClassifierBasedTaggerBatchTrained(
        feature_detector=pos_features,
        train=read_ud_pos_data(
            r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu'),
        classifier_builder=lambda iterator, detector:
        incremental_train_scikit_classifier(
            iterator, detector, batch_size=500, max_iterations=100),
    )
    end_time = time.time()
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))

    print("Computing test set accuracy ...")
    print(tagger.evaluate(test_data))  # 0.9255606807698425

    print(tagger.tag("This is a test".split()))
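read_ud_pos_data and pos_features are project-specific helpers that are not included in this excerpt. For orientation, a minimal CoNLL-U reader with the shape the training loop appears to expect (a lazy iterable of [(word, upos), ...] sentences) could be sketched as follows; this is an assumption, not the original implementation:

def read_ud_pos_data(path):
    # Hypothetical sketch: yield one sentence at a time as [(word, upos), ...]
    # from a CoNLL-U file. Comment lines start with '#', sentences are
    # separated by blank lines, and multiword/empty tokens are skipped.
    with open(path, encoding='utf-8') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    yield sentence
                    sentence = []
                continue
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if '-' in cols[0] or '.' in cols[0]:
                continue  # multiword token range or empty node
            sentence.append((cols[1], cols[3]))  # FORM and UPOS columns
        if sentence:
            yield sentence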