def train_and_score(featureset, training_samples, test_samples, other_samples=None):
    """Train a NamedEntityChunker on the training data and return accuracy
    scores for the test and training sets.

    An optional extra set of samples can be passed to be scored as well.
    """
    chunker = NamedEntityChunker(training_samples, featureset)
    testscore = chunker.evaluate([
        conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
        for sentence in test_samples
    ]).accuracy()
    trainscore = chunker.evaluate([
        conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
        for sentence in training_samples
    ]).accuracy()
    if other_samples:
        mixscore = chunker.evaluate([
            conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
            for sentence in other_samples
        ]).accuracy()
        return (testscore, trainscore, mixscore)
    return (testscore, trainscore)
def processLanguage():
    try:
        for item in contentArray:
            stop_words = set(stopwords.words('english'))
            word_tokens = nltk.word_tokenize(item)
            # drop stopwords before tagging
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            print(filtered_sentence)

            tagged = nltk.pos_tag(filtered_sentence)
            print(tagged)

            ne_tree = ne_chunk(tagged)
            print(ne_tree)

            iob_tagged = tree2conlltags(ne_tree)
            print(iob_tagged)

            ne_tree = conlltags2tree(iob_tagged)
            print(ne_tree)

            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()
    except Exception as e:
        print(str(e))
def stanford_tree(bio_tagged):
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in pos_tag(tokens)]
    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    return ne_tree
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne)
                      for token, pos, ne
                      in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
def get_result_json(result_list):
    tokens = [result['word'] for result in result_list]
    tags = [result['tag'] for result in result_list]
    re_dict_json = defaultdict(int)
    original_text = defaultdict(list)

    pos_tags = [pos for token, pos in pos_tag(tokens)]
    conlltags = [(token, pos, tg)
                 for token, pos, tg in zip(tokens, pos_tags, tags)]
    ne_tree = conlltags2tree(conlltags)

    for subtree in ne_tree:
        original_string = []
        if type(subtree) == Tree:
            original_label = subtree.label()
            leaves = subtree.leaves()
        else:
            leaves = [subtree]
        for token, pos in leaves:
            token = token.replace('##', '')
            re_dict_json[re_find_which_pattern(token)] += 1
            original_string.append(
                (token, int(re_dict_json[re_find_which_pattern(token)])))
        if original_string:
            try:
                # original_label is only bound for named-entity subtrees;
                # plain 'O' tokens seen before any entity raise NameError
                original_text[original_label.lower()].append(original_string)
            except NameError:
                pass
    return original_text
def read(self):
    self.iob_sents = []
    self.sents = []
    self.feature_set = []
    self.train_set = []
    self.test_set = []
    self.iob_train = []
    self.iob_test = []

    for filename in glob(self.dirname):
        file_feature_set = []
        file_iob_sents = []
        with open(filename, 'rb') as f:
            tags_re = r'\[(.*?)\]'
            tags_sub = r'\[[A-Z]+\s|\]'
            for sentence in f:
                sentence = sentence.decode('utf8')
                text = re.sub(tags_sub, '', sentence).strip('\n').strip()
                self.sents.append(text)
                file_feature_set.append(
                    (text, splitext(basename(filename))[0]))
                pos_tags = pos_tag(text)[0]
                tags = []
                for tag in re.findall(tags_re, sentence):
                    tag, value = tag.split(' ', 1)
                    words = word_tokenize(value)
                    first = [(words[0], self.pop_pos(pos_tags, words[0]),
                              'B-%s' % tag)]
                    tags.append(first + [(w, self.pop_pos(pos_tags, w),
                                          'I-%s' % tag)
                                         for w in words[1:]])
                itags = iter(tags)
                text_list = re.sub(tags_re, '[NE]', sentence).split('[NE]')
                iob = []
                for part in text_list:
                    tagged_part = [(w, self.pop_pos(pos_tags, w), 'O')
                                   for w in word_tokenize(part)]
                    try:
                        ne = next(itags)
                        iob += tagged_part + ne
                    except StopIteration:
                        iob += tagged_part
                file_iob_sents.append(iob)

        self.feature_set.extend(file_feature_set)
        self.iob_sents.extend(file_iob_sents)

        file_iob_train, file_iob_test = train_test_split(
            [[((w, p), i) for w, p, i in s] for s in file_iob_sents],
            test_size=self.test_size, random_state=self.random_state)
        file_train_set, file_test_set = train_test_split(
            file_feature_set, test_size=self.test_size,
            random_state=self.random_state)
        self.iob_train.extend(file_iob_train)
        self.iob_test.extend(file_iob_test)
        self.train_set.extend(file_train_set)
        self.test_set.extend(file_test_set)

    self.chunked_sents = [conlltags2tree(x) for x in self.iob_sents]
def nltk_ner(x, *args):
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk import conlltags2tree, tree2conlltags

    ne_tree = ne_chunk(pos_tag(word_tokenize(x)))
    iob_tagged = tree2conlltags(ne_tree)
    ne_tree = conlltags2tree(iob_tagged)
    return ne_tree
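# Minimal usage sketch for nltk_ner above; the sentence is illustrative only
# and the call assumes the usual NLTK data packages ('punkt',
# 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words') are installed.
tree = nltk_ner("Mark works at Google in London.")
print(tree)  # an nltk.Tree whose subtrees carry labels such as PERSON, ORGANIZATION or GPE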
def parse(self, sentence):
    pos_tags = [pos for word, pos in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    conlltags = [(word, pos_tag, chunk_tag)
                 for ((word, pos_tag), (pos_tag, chunk_tag))
                 in zip(sentence, tagged_pos_tags)]
    return conlltags2tree(conlltags)
def __create_tree__(self, tokens, key):
    _input = self.__get_folia_doc__(tokens)
    __output = self._frog.process(_input)
    for token in __output:
        token['pos'] = token['pos'].split('(')[0]
        if token['pos'].startswith('SPEC'):
            token['pos'] = 'NNP'
    return conlltags2tree([(token['text'], token['pos'], token[key])
                           for token in __output])
def load_iobtags(iobtags):
    if iobtags is None:
        return None
    try:
        iobtags = [make_tuple(i.strip()) if i.endswith(')')
                   else make_tuple(i.strip() + ")")
                   for i in iobtags[1:-1].split("),")]
        return conlltags2tree(iobtags)
    except:
        return None
def stanford_ner_to_tree(text):
    bio_tagged = stanford_ner_to_bio(stanford_ner(text))
    sentence_tokens, sentence_ne_tags = zip(*bio_tagged)
    sentence_pos_tags = [pos for token, pos in pos_tag(sentence_tokens)]
    sentence_conlltags = [(token, pos, ne)
                          for token, pos, ne
                          in zip(sentence_tokens, sentence_pos_tags,
                                 sentence_ne_tags)]
    return conlltags2tree(sentence_conlltags)
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the normalized format of triplets [(w1, t1, iob1), ...]
    iob_triplets = [(word, token, chunk) for ((word, token), chunk) in chunks]
    # Transform the list of triplets to NLTK tree format
    return conlltags2tree(iob_triplets)
def parse(self, sentence):
    tagged_sents = self.tagger.tag(sentence)
    tagged_sents = conlltags2tree(tagged_sents)
    # Nested chunked tags for CLAUSING
    # cp = nltk.RegexpParser(grammar)
    # tagged_sents = cp.parse(tagged_sents)
    return tagged_sents
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
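# Hedged sketch of the IOB-triplet round trip the parse() methods above rely on:
# conlltags2tree turns [(word, pos, iob), ...] into an nltk.Tree and
# tree2conlltags flattens it back. The triplets below are hand-made examples.
from nltk.chunk import conlltags2tree, tree2conlltags

triplets = [('Mark', 'NNP', 'B-PERSON'),
            ('lives', 'VBZ', 'O'),
            ('in', 'IN', 'O'),
            ('New', 'NNP', 'B-GPE'),
            ('York', 'NNP', 'I-GPE')]
tree = conlltags2tree(triplets)
print(tree)  # (S (PERSON Mark/NNP) lives/VBZ in/IN (GPE New/NNP York/NNP))
assert tree2conlltags(tree) == triplets  # the round trip is lossless here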
def get_chuncker_accuracy(chunker, test_samples):
    """Return the accuracy of the chunker against the gold-standard samples."""
    score = chunker.evaluate([
        conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
        for iobs in test_samples
    ])
    return score.accuracy()
def iob_tagged_ner(sentence):
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk import conlltags2tree, tree2conlltags

    ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    iob_tagged = tree2conlltags(ne_tree)
    iob_tagged_ne_tree = conlltags2tree(iob_tagged)
    return iob_tagged_ne_tree
def get_tags(tokens):
    # tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    namedEnt = nltk.ne_chunk(tagged)
    iob_tagged = tree2conlltags(namedEnt)
    ne_tree = conlltags2tree(iob_tagged)
    return ne_tree
def parse(self, tagged_sent, return_tree=True):
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = revert_scheme([(w, t, c) for ((w, t), c) in chunks])
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets) if return_tree else iob_triplets
def parse(self, tokens):
    _input = self.__get_folia_doc__(tokens)
    __output = self._frog.process(_input)
    for token in __output:
        token['pos'] = token['pos'].split('(')[0]
        if token['pos'].startswith('SPEC'):
            token['pos'] = 'NNP'
        if token['chunker'] != 'O' and token['ner'] == 'O':
            token['ner'] = token['chunker']
    return conlltags2tree([(token['text'], token['pos'], token['ner'])
                           for token in __output])
def transform_stanford_name_entity_to_tree(ne_tagged_sent):
    ne_tree = []
    if ne_tagged_sent:
        bio_tagged_sent = Helper.transform_stanford_name_entity_to_bio(
            ne_tagged_sent)
        sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
        sent_pos_tags = [pos for token, pos in nltk.pos_tag(sent_tokens)]
        sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(
            sent_tokens, sent_pos_tags, sent_ne_tags)]
        ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
def make_ne_tree(self, tagged):
    bio_tagged_sent = self.stanford_reformat(tagged)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne)
                      for token, pos, ne
                      in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
def parse(self, tagged_sent):
    """Used by evaluate() to make guesses and format them as a tree."""
    # make a guess (tag)
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, p, t) for ((w, p), t) in chunks]
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
def parse(self, sentence):
    pos_tags = [pos for word, pos in sentence]
    # Get the chunk tags
    tagged_pos_tags = self.tagger.tag(pos_tags)
    # Assemble the (word, pos, chunk) triplets
    conlltags = [(word, pos_tag, chunk_tag)
                 for ((word, pos_tag), (pos_tag, chunk_tag))
                 in zip(sentence, tagged_pos_tags)]
    # Transform to tree
    return conlltags2tree(conlltags)
def stanfordNE2tree(self, ne_tagged_sent):
    # placeholder for parsed-tree output
    bio_tagged_sent = self.stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne)
                      for token, pos, ne
                      in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
def parse(self, orig_tokens):
    if orig_tokens and type(orig_tokens[0]) is tuple:
        tokens = [token for token, _ in orig_tokens]
    else:
        tokens = orig_tokens

    tokenized_ud = list(
        map(lambda x: (x[0], map_tag('ru-rnc', 'universal', x[1])),
            pos_tag(tokens, lang='rus')))
    tokenized_nltk = pos_tag(tokens, lang='rus')
    tokenized_mystem = [(token, self.mystem_tagger.tag_word(token)[0][1])
                        for token in tokens]

    # print(self.chunker_iis.parse(tokenized_ud))
    tags_nltk = self.chunker_nltk.parse(tokenized_nltk, return_tree=False)
    tags_ud = self.chunker_nltk.parse(tokenized_ud, return_tree=False)
    tags_mystem = self.chunker_nltk.parse(tokenized_mystem, return_tree=False)
    tags_iis = tree2conlltags(self.chunker_iis.parse(tokenized_ud))
    tags_grammar = tree2conlltags(
        self.grammar_chunker.parse(tokenized_mystem))
    result_tags = [tags_nltk, tags_ud, tags_mystem, tags_grammar, tags_iis]

    if tokens is orig_tokens:
        tag_source = tags_ud
    else:
        tag_source = orig_tokens

    tags = [(token, tag_source[ind][1],
             pick_tag([tags_sp[ind][2] for tags_sp in result_tags],
                      tags_ud[ind][1]))
            for ind, token in enumerate(tokens)]

    # for ind, (token, pos, iob_tag) in enumerate(tags):
    #     if token in set(['таких', 'такие', 'такими', 'как', 'включая', 'и',
    #                      'или', 'другие', 'других', 'другими', 'особенно',
    #                      'в', 'частности', ',']):
    #         tags[ind] = (token, pos, 'O')

    for ind, (token, pos, iob_tag) in enumerate(tags):
        if ind == 0:
            continue
        if iob_tag == "B-NP*":
            if tags[ind - 1][2] in {'B-NP', 'I-NP'}:
                tags[ind] = (token, pos, 'I-NP')
            else:
                tags[ind] = (token, pos, 'B-NP')
        if iob_tag == "I-NP" and tags[ind - 1][2] not in {'B-NP', 'I-NP'}:
            tags[ind] = (token, pos, 'B-NP')
    return conlltags2tree(tags)
def __generate_tree(self, bio_tagged):
    """Transform a list of BIO tags into an nltk.Tree."""
    from nltk import pos_tag
    from nltk.chunk import conlltags2tree

    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in pos_tag(tokens)]
    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    return ne_tree
def stanford_ner(words, args):
    """
    3 class: Location, Person, Organization
    4 class: Location, Person, Organization, Misc
    7 class: Location, Person, Organization, Money, Percent, Date, Time
    """
    start = time.time()
    ner_classifier_path = 'english.all.3class.distsim.crf.ser.gz'  # default 3 class
    if args.ner_class == 7:
        ner_classifier_path = 'english.muc.7class.distsim.crf.ser.gz'
    elif args.ner_class == 4:
        ner_classifier_path = 'english.conll.4class.distsim.crf.ser.gz'
    ner_classifier_full_path = os.path.join(stanford_ner_directory_path,
                                            'classifiers', ner_classifier_path)
    ner_jar_path = os.path.join(stanford_ner_directory_path, 'stanford-ner.jar')
    s_ner_tagger = StanfordNERTagger(ner_classifier_full_path, ner_jar_path,
                                     encoding='UTF-8')
    _tagged = s_ner_tagger.tag(words)

    # NLP BIO tags processing (B-beginning NE, I-inside NE, O-outside NE)
    bio_tagged = []
    prev_tag = "O"
    for token, tag in _tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag

    # convert BIO tags to NLTK tree-like format
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in get_pos_tags(tokens, args)]
    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    print('Stanford NER took %.3f sec, NEs are:\n %s\n' % (
        time.time() - start, structure_ne(ne_tree)))
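# Hand-made illustration of the BIO re-tagging performed above: flat
# Stanford-style (token, tag) pairs gain B-/I- prefixes so that
# conlltags2tree can group consecutive tokens into entity subtrees.
# The tokens and tags below are invented for illustration.
stanford_style = [('Barack', 'PERSON'), ('Obama', 'PERSON'),
                  ('visited', 'O'), ('Microsoft', 'ORGANIZATION')]
# expected BIO output of the loop above:
# [('Barack', 'B-PERSON'), ('Obama', 'I-PERSON'),
#  ('visited', 'O'), ('Microsoft', 'B-ORGANIZATION')]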
def evaluate_chunker(chunker, test_samples):
    accuracy = 0
    with open(test_samples, 'rb') as fp:
        dataset = pickle.load(fp)
    for i in range(len(dataset)):
        score = chunker.evaluate([
            conlltags2tree([(w, t, iob) for ((w, t), iob) in dataset[i]])
        ])
        accuracy = accuracy + score.accuracy()
    return accuracy / len(dataset)
def NER_nltk(sentence):
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk import conlltags2tree, tree2conlltags

    ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    print(ne_tree)
    print("-----------------------------")
    iob_tagged = tree2conlltags(ne_tree)
    print(iob_tagged)
    print("-----------------------------")
    ne_tree = conlltags2tree(iob_tagged)
    print(ne_tree)
def convertIOBtag(tokens, tags):
    # tag each token with pos
    pos_tags = [pos for token, pos in pos_tag(tokens)]
    # convert the BIO / IOB tags to a tree
    conlltags = [(token, pos, tg)
                 for token, pos, tg in zip(tokens, pos_tags, tags)]
    ne_tree = conlltags2tree(conlltags)
    # walk the tree to recover the original entity strings
    original_text = []
    for subtree in ne_tree:
        # skip plain 'O' tokens, keep named-entity subtrees
        if type(subtree) == Tree:
            original_label = subtree.label()
            original_string = " ".join(
                [token for token, pos in subtree.leaves()])
            original_text.append((original_string, original_label))
    return original_text
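# Illustrative call to convertIOBtag above; the tokens and IOB tags are made
# up, and the call assumes pos_tag, conlltags2tree and Tree are imported as in
# the function (from nltk, nltk.chunk and nltk.tree respectively).
tokens = ['Angela', 'Merkel', 'visited', 'Paris']
tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
print(convertIOBtag(tokens, tags))  # [('Angela Merkel', 'PER'), ('Paris', 'LOC')]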
def stanfordNE2tree(ne_tagged_sent):
    """Convert a named-entity-tagged sentence into a tree.

    Parameters
    ----------
    ne_tagged_sent : list
        Sentence tagged by the Stanford NER tagger.

    Returns
    -------
    Tree
        NLTK tree structure built from CoNLL IOB tags.
    """
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne)
                      for token, pos, ne
                      in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
def parse(self, sent):
    posTags = [posTag for (word, posTag) in sent]
    bioTags = [bioTag for (posTag, bioTag) in self.tagger.tag(posTags)]
    chunkedSent = [(word, posTag, bioTag)
                   for ((word, posTag), bioTag) in zip(sent, bioTags)]
    return conlltags2tree(chunkedSent)
def tagged_parse_sents(self, sentences):
    return conlltags2tree(super(Chunker, self).tag_sents(sentences))
def parse(self, tagged_sent):
    if not tagged_sent:
        return None
    chunks = self.tagger.tag(tagged_sent)
    return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
def parse_sents(self, sentences):
    for conlltagged in super(Chunker, self).tag_sents(sentences):
        yield conlltags2tree(conlltagged)