from collections.abc import Iterable
import re

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.tag import ClassifierBasedTagger

# `features` is assumed to be a feature-detector function defined elsewhere.


class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list-of-triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def parse_tweets(self, tweets):
        # Raw string so the regex escapes are not mangled by Python's own
        # string-escape handling.
        regex = re.compile(
            r'[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
        )
        named_entities_tree = ''
        for tweet in tweets:
            text = str(tweet.processed_text).lower()
            text = regex.sub('', text)
            current_tree = self.parse(pos_tag(word_tokenize(text)))
            named_entities_tree += str(current_tree)
        return named_entities_tree
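# Usage sketch (not part of the original snippet): `training_samples` is an
# assumed name for sentences in [((word, pos), iob), ...] form, e.g. read
# from the GMB corpus as in the nlpforhackers.io tutorial this pattern
# follows.
if __name__ == '__main__':
    chunker = NamedEntityChunker(training_samples)
    tree = chunker.parse(pos_tag(word_tokenize(
        "San Francisco is foggy in June.")))
    print(tree)  # an nltk.Tree with named-entity subtrees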
import nltk
from nltk.corpus import conll2000
from nltk.tag import ClassifierBasedTagger

# `npchunk_features` is assumed to be a feature-detector function defined
# elsewhere in the project.


class Chunker(nltk.chunk.ChunkParserI):
    '''
    Chunker for SCLE. Only chunks NP for now.
    '''

    def __init__(self):
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                         for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                            for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                             feature_detector=npchunk_features)

    def chunk(self, sentences):
        '''
        Chunk POS-tagged sentences into NP trees.
        '''
        chunked_sents = []
        for sent in sentences:
            c_sent = self._tagger.tag(sent)
            conlltags = [(w, t, c) for ((w, t), c) in c_sent]
            chunked_sents.append(nltk.chunk.conlltags2tree(conlltags))
        return chunked_sents

    def evaluate(self):
        '''
        Evaluate the chunker.
        '''
        print(self._tagger.evaluate(self._test_sents))
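# Usage sketch (assumes `npchunk_features` exists and the conll2000 corpus
# is downloaded): the constructor trains on CoNLL-2000, so instantiation is
# slow; chunk() then takes POS-tagged sentences.
if __name__ == '__main__':
    chunker = Chunker()
    trees = chunker.chunk([[('the', 'DT'), ('little', 'JJ'),
                            ('dog', 'NN'), ('barked', 'VBD')]])
    print(trees[0])
    chunker.evaluate()  # prints accuracy on the CoNLL-2000 test split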
class NamedEntityChunker(ChunkParserI): """Class with overridden parser and init. This class is equipped to learn and predict given training data. The data is [[()]] """ def __init__(self, train_sents, feat_detector, **kwargs): assert isinstance(train_sents, Iterable) self.feature_detector = feat_detector self.tagger = ClassifierBasedTagger(train=train_sents, feature_detector=feat_detector, **kwargs) def parse(self, tagged_sent): """This function is used by evaluate to make guesses and format the guesses """ #make gueess (tag) chunks = self.tagger.tag(tagged_sent) # Transform the result from [((w1, t1), iob1), ...] # to the preferred list of triplets format [(w1, t1, iob1), ...] iob_triplets = [(w, p, t) for ((w, p), t) in chunks] # Transform the list of triplets to nltk.Tree format return conlltags2tree(iob_triplets)
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.tag import ClassifierBasedTagger

# `prev_next_pos_iob` and `chunk_trees2train_chunks` are assumed to be
# defined elsewhere in the project (a sketch of both follows below).


class ClassifierChunker(ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
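# The two helpers referenced above are not shown in the snippet. Below is a
# sketch of common definitions (essentially the ones in Jacob Perkins'
# "Python 3 Text Processing with NLTK 3 Cookbook" code); treat them as
# assumptions rather than this project's exact implementation.
from nltk.chunk import tree2conlltags


def chunk_trees2train_chunks(chunk_sents):
    # Turn chunk trees into [((word, pos), iob), ...] training sentences.
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]


def prev_next_pos_iob(tokens, index, history):
    # Feature detector over the surrounding words/POS tags plus the
    # previously predicted IOB tag.
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos, previob = ('<START>',) * 3
    else:
        prevword, prevpos = tokens[index - 1]
        previob = history[index - 1]
    if index == len(tokens) - 1:
        nextword, nextpos = ('<END>',) * 2
    else:
        nextword, nextpos = tokens[index + 1]
    return {
        'word': word, 'pos': pos,
        'prevword': prevword, 'prevpos': prevpos,
        'nextword': nextword, 'nextpos': nextpos,
        'previob': previob,
    }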
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.tag import ClassifierBasedTagger

# `chunk_trees2train_chunks` is assumed available (see the sketch above).


class ClassifierChunker(ChunkParserI):  # pylint: disable = W0223
    """
    Classifier-based chunker class implementation
    """

    def __init__(self, train_sents, feature_detector, **kwargs):
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tokens):
        """
        Parse sentence into chunks
        """
        if not tokens:
            return None
        chunked = self.tagger.tag(tokens)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
from collections.abc import Iterable

from nltk.chunk import ChunkParserI
from nltk.tag import ClassifierBasedTagger


class NamedEntityChunker(ChunkParserI):
    '''
    Named Entity Chunker using ClassifierBasedTagger() and features
    generated by features()
    '''

    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(feature_detector=features,
                                            train=train_sents,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        # Transform [((w1, t1), iob1), ...] to triplets [(w1, t1, iob1), ...].
        # Note: unlike the other variants, this one returns the raw triplets
        # rather than an nltk.Tree.
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return iob_triplets
from collections.abc import Iterable

from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.tag import ClassifierBasedTagger


class NEChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs
        )

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the normalized format of triplets [(w1, t1, iob1), ...]
        iob_triplets = [(word, tag, chunk) for ((word, tag), chunk) in chunks]

        # Transform the list of triplets to NLTK tree format
        return conlltags2tree(iob_triplets)
import pickle
import string

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import ClassifierBasedTagger
from nltk.tag.util import untag


class address_chunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=self.features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # For more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # Init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + \
            list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # Shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]

        contains_dash = '-' in word
        contains_dot = '.' in word
        # Check every character (the original comprehension filtered out
        # failing characters, so it was always True)
        allascii = all(c in string.ascii_lowercase for c in word)

        # word.upper(), not word.capitalize(), so the feature matches its name
        allcaps = word == word.upper()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.upper()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.upper()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

    def save_to_file(self, file_name):
        save_classifier = open(file_name, "wb")
        pickle.dump(self, save_classifier)
        save_classifier.close()

    def chunk(self, sentence):
        # `tree_filter` is assumed to be defined at module level
        # (a sketch follows below).
        tagged_tree = self.parse(pos_tag(word_tokenize(sentence)))

        chunks = []
        for subtree in tagged_tree.subtrees(filter=tree_filter):
            chunks.append(untag(subtree.leaves()))

        # Keep the longest chunk found
        max_length = 0
        for i in range(len(chunks)):
            if len(chunks[i]) > max_length:
                chunk = chunks[i]
                max_length = len(chunks[i])

        # Join its tokens, gluing '.' and ',' to the preceding token
        output = ''
        if len(chunks) > 0:
            for i in range(len(chunk)):
                if not chunk[i] == '.' and not chunk[i] == ',' and not i == 0:
                    output = output + ' ' + chunk[i]
                else:
                    output = output + chunk[i]
        return output
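# `tree_filter` is referenced above but never defined. A plausible sketch,
# assuming the training trees label address chunks 'ADDRESS'; swap in
# whatever label the training data actually uses.
def tree_filter(tree):
    return tree.label() == 'ADDRESS'


# Usage sketch (`train_sents` assumed to be IOB-formatted training data):
#   chunker = address_chunker(train_sents)
#   print(chunker.chunk('Send it to 221B Baker Street, London.'))
#   chunker.save_to_file('address_chunker.pickle')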
import string

from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import ClassifierBasedTagger


class AddressChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list-of-triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # For more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # Init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + \
            list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # Shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]

        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all(c in string.ascii_lowercase for c in word)

        allcaps = word == word.upper()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.upper()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        # Fixed copy-paste bug: these previously tested prevword again
        nextallcaps = nextword == nextword.upper()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }
        return f
import time

# ScikitClassifier, ClassifierBasedTaggerBatchTrained, read_ud_pos_data and
# pos_features are project-specific helpers defined elsewhere. Only the tail
# of incremental_train_scikit_classifier survives in this excerpt; its
# signature is reconstructed from the call site below.


def incremental_train_scikit_classifier(iterator, detector,
                                        batch_size, max_iterations):
    # (body elided in the original snippet; `classifier` and `vectorizer`
    # are built above this point)
    scikit_classifier = ScikitClassifier(classifier=classifier,
                                         vectorizer=vectorizer)
    return scikit_classifier


# Train Online Learning POS Tagger
if __name__ == "__main__":
    test_data = read_ud_pos_data(
        r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu')

    start_time = time.time()
    print("Starting training ...")
    tagger = ClassifierBasedTaggerBatchTrained(
        feature_detector=pos_features,
        train=read_ud_pos_data(
            r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu'),
        classifier_builder=lambda iterator, detector:
            incremental_train_scikit_classifier(
                iterator, detector, batch_size=500, max_iterations=100),
    )
    end_time = time.time()
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))

    print("Computing test set accuracy ...")
    print(tagger.evaluate(test_data))  # 0.9255606807698425

    print(tagger.tag("This is a test".split()))
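# A hedged sketch (not the author's code) of what the elided body of
# incremental_train_scikit_classifier might look like, using scikit-learn's
# out-of-core API: a stateless FeatureHasher as the vectorizer and
# SGDClassifier.partial_fit over mini-batches. Assumptions: the iterator
# yields (feature_dict, tag) pairs, ScikitClassifier simply bundles the
# fitted estimator with its vectorizer, and the labels are the 17 Universal
# Dependencies POS tags (partial_fit needs the full label set up front).
from itertools import islice

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier

UPOS_TAGS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
             'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
             'VERB', 'X']


def incremental_train_scikit_classifier_sketch(iterator, detector,
                                               batch_size=500,
                                               max_iterations=100,
                                               classes=UPOS_TAGS):
    # `detector` is accepted only to match the builder signature; the
    # iterator is assumed to already yield (feature_dict, tag) pairs.
    hasher = FeatureHasher(input_type='dict')  # stateless: no fit pass needed
    clf = SGDClassifier()
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            # Stream exhausted; a plain iterator cannot be replayed, so
            # max_iterations is unused in this single-pass sketch.
            break
        feature_dicts, labels = zip(*batch)
        # partial_fit requires the complete label set on the first call
        clf.partial_fit(hasher.transform(feature_dicts), labels,
                        classes=classes)
    return ScikitClassifier(classifier=clf, vectorizer=hasher)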