def convert(self, parsed, stanford=False, ispos=False, csv=False, csv_line_id=0, iobtree=False):
    # Choose the accumulator that matches the requested output format
    if csv:
        conll_format = 'Sentence: ' + str(csv_line_id)
    elif iobtree:
        conll_format = None
    else:
        conll_format = ''
    conll = []
    for entity, value in parsed.items():
        value_split = value.split()
        value_split_len = len(value_split)
        if value != '':
            if ispos:
                value_pos = pos_tag(value_split)  # pos = part of speech
                conll_format += self.__get_conll_pos(value_pos, value_split_len, entity)
            elif csv:
                value_pos = pos_tag(value_split)
                conll_format += self.__get_csv(value_pos, value_split_len, entity)
            elif iobtree:
                value_pos = pos_tag(value_split)
                iob = self.__get_nltk_tree(value_pos, value_split_len, entity)
                conll = conll + iob
            else:
                conll_format += self.__get_conll_format(value_split, value_split_len, entity, stanford)
            if csv is False and iobtree is False:
                conll_format += '\n'
    if iobtree:
        # Build an nltk.Tree from the accumulated IOB triplets
        conll_format = conlltags2tree(conll)
    return conll_format
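# A hypothetical call to convert; `NerConverter` is an assumed name for the
# class this method belongs to, and the `parsed` dict shape (entity label ->
# matched text) is inferred from how the loop above consumes it:
converter = NerConverter()
parsed = {'person': 'Steve Jobs', 'organization': 'Apple'}
tree = converter.convert(parsed, iobtree=True)   # nltk.Tree built via conlltags2tree
text = converter.convert(parsed, stanford=True)  # Stanford-style CoNLL text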
def Ext_Chunks(sents):
    NP_li = []
    grammar_exp = r"""
    CHUNK: {<NN><NN.*><NN.*>+}   # chunk runs of three or more nouns
           }<NNP>+{              # chink (exclude) sequences of proper nouns
    """
    cp = nltk.RegexpParser(grammar_exp)
    for sent in sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                print(subtree)
                iob_tags = tree2conlltags(subtree)
                iob_tree = conlltags2tree(iob_tags)
                print(iob_tags)
                print(iob_tree)
                # Strip the POS tags and tree markup to recover the plain chunk text
                chunk_words = (str(subtree)
                               .replace('/DT', '').replace('/JJS', '').replace('/JJ', '')
                               .replace('/NNS', '').replace('/NNP', '').replace('(CHUNK', '')
                               .replace(')', '').replace('/NN', '').replace('\n', ''))
                NP_li.append(chunk_words)
                print(chunk_words, '\n')
    print('----------------------------------------------------------------\n', NP_li)
    return NP_li
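# A minimal usage sketch; Ext_Chunks expects sentences that are already
# POS-tagged lists of (word, tag) pairs:
from nltk import pos_tag, word_tokenize
sents = [pos_tag(word_tokenize("The stock market crash hurt many small investors."))]
noun_phrases = Ext_Chunks(sents)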
def read_gmb_ner(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                        standard_form_tokens = []
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
                            # Keep only the primary NER category (e.g. 'geo-nam' -> 'geo')
                            if ner != 'O':
                                ner = ner.split('-')[0]
                            standard_form_tokens.append((word, tag, ner))
                        conll_tokens = to_conll_iob(standard_form_tokens)
                        yield conlltags2tree(conll_tokens)
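# `to_conll_iob` is referenced above but not defined in this snippet. A minimal
# sketch, assuming it converts the GMB per-token labels (O, per, geo, ...) into
# proper IOB notation (O, B-per, I-per, ...) so conlltags2tree can build chunks:
def to_conll_iob(annotated_sentence):
    """[(w1, t1, ner1), ...] -> [(w1, t1, iob1), ...] with B-/I- prefixes."""
    iob_tokens = []
    for idx, (word, tag, ner) in enumerate(annotated_sentence):
        if ner != 'O':
            # The first token of an entity (or a change of entity type)
            # gets B-; continuation tokens get I-.
            if idx == 0 or annotated_sentence[idx - 1][2] != ner:
                ner = 'B-' + ner
            else:
                ner = 'I-' + ner
        iob_tokens.append((word, tag, ner))
    return iob_tokens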
def parse(self, tokens):
    """
    Chunk a tagged sentence.
    :param tokens: list of (word, tag) pairs, or a raw string (tokenized and tagged here)
    :return: list of {"type": chunk label, "value": chunk text} dicts
    """
    if isinstance(tokens, str):
        tokens = pos_tag(word_tokenize(tokens))
    history = []
    iob_tagged_tokens = []
    for index, (word, tag) in enumerate(tokens):
        iob_tag = self._classifier.predict([self._feature_detector(tokens, index, history)])[0]
        history.append(iob_tag)
        iob_tagged_tokens.append((word, tag, iob_tag))
    results = conlltags2tree(iob_tagged_tokens)
    # Collect each non-root subtree as a {type, value} record
    return_val = []
    for i in results.subtrees():
        if i.label() != 'S':
            word = ' '.join(x[0] for x in i.leaves())
            return_val.append({"type": i.label(), "value": word})
    return return_val
def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                        standard_form_tokens = []
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
                            ner_tags[ner] += 1
                            # Get only the primary category
                            if ner != 'O':
                                ner = ner.split('-')[0]
                            standard_form_tokens.append((word, tag, ner))
                        conll_tokens = to_conll_iob(standard_form_tokens)
                        yield conlltags2tree(conll_tokens)
    print("Data read done")
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
    iob_triplets = tagged_pairs2triplets(chunks)
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
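# `tagged_pairs2triplets` is not defined in this snippet. A minimal sketch,
# assuming it flattens the tagger output [((w, t), iob), ...] into the
# [(w, t, iob), ...] triplet format that conlltags2tree expects (the same
# transformation that a later variant of parse writes inline):
def tagged_pairs2triplets(chunks):
    return [(word, tag, iob) for ((word, tag), iob) in chunks]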
def parse(self, tokens):
    history = []
    iob_tagged_tokens = []
    for index, (word, tag) in enumerate(tokens):
        iob_tag = self._classifier.predict([self._feature_detector(tokens, index, history)])[0]
        history.append(iob_tag)
        # list.append takes a single argument, so the triplet must be a tuple
        iob_tagged_tokens.append((word, tag, iob_tag))
    return conlltags2tree(iob_tagged_tokens)
def stanford_tree(bio_tagged):
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in pos_tag(tokens)]
    conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    return ne_tree
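# A minimal usage sketch of stanford_tree; the BIO-tagged input below is a
# made-up example in the [(token, bio_ner_tag), ...] shape the function expects:
bio_tagged = [('Barack', 'B-PERSON'), ('Obama', 'I-PERSON'),
              ('visited', 'O'), ('Paris', 'B-LOCATION'), ('.', 'O')]
print(stanford_tree(bio_tagged))
# -> roughly (S (PERSON Barack/NNP Obama/NNP) visited/VBD (LOCATION Paris/NNP) ./.)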
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
def ner_eval(chunker, test_samples):
    """
    Evaluate named entity recognition accuracy.
    :return: accuracy over the first 500 test samples
    """
    score = chunker.evaluate([
        nltk.conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
        for iobs in test_samples[:500]
    ])
    return score.accuracy()
def read_gmb_ner(corpus_root, start_index=None, end_index=None):
    current_file = -1
    for root, _, files in os.walk(corpus_root):
        for filename in files:
            # Skip other files
            if not filename.endswith(".tags"):
                continue
            current_file += 1
            # Skip files until we get to the start_index
            if start_index is not None and current_file < start_index:
                continue
            # Stop reading after end_index
            if end_index is not None and current_file > end_index:
                return
            with open(os.path.join(root, filename), 'rb') as file_handle:
                # Read the entire file
                file_content = file_handle.read().decode('utf-8').strip()
                # Split into sentences
                annotated_sentences = file_content.split('\n\n')
                for annotated_sentence in annotated_sentences:
                    # Split into annotated tokens
                    rows = [row for row in annotated_sentence.split('\n') if row]
                    ner_triplets = []
                    for row in rows:
                        annotations = row.split('\t')
                        word, tag, ner = annotations[0], annotations[1], annotations[3]
                        # Get only the main NER tag
                        if ner != 'O':
                            ner = ner.split('-')[0]
                        # Make these POS tags NLTK compatible
                        if tag in ('LQU', 'RQU'):
                            tag = "``"
                        # Ignore the underrepresented art, eve, nat NER categories
                        if ner in ('art', 'eve', 'nat'):
                            ner = 'O'
                        ner_triplets.append((word, tag, ner))
                    iob_triplets = ner2conlliob(ner_triplets)
                    # Yield an nltk.Tree
                    yield conlltags2tree(iob_triplets)
    print("Total files =", current_file)
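# A hypothetical usage, assuming the GMB 2.2.0 corpus has been unpacked so
# that `corpus_root` points at its data directory (the path is illustrative):
corpus_root = 'gmb-2.2.0/data'
for tree in read_gmb_ner(corpus_root, start_index=0, end_index=9):
    print(tree)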
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne)
                      for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = nltk.conlltags2tree(sent_conlltags)
    return ne_tree
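# `stanfordNE2BIO` is referenced above but not defined here. A minimal sketch,
# assuming it converts Stanford NER's flat output [('Rami', 'PERSON'), ...]
# into BIO form [('Rami', 'B-PERSON'), ...]:
def stanfordNE2BIO(ne_tagged_sent):
    bio_tagged = []
    prev_tag = 'O'
    for token, tag in ne_tagged_sent:
        if tag == 'O':
            bio_tagged.append((token, tag))
        elif prev_tag == tag:
            # Continuation of the same entity
            bio_tagged.append((token, 'I-' + tag))
        else:
            # Start of a new entity (after O or a different entity type)
            bio_tagged.append((token, 'B-' + tag))
        prev_tag = tag
    return bio_tagged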
def parse(self, tokens):
    """
    Chunk a tagged sentence.
    tokens: list of (word, tag) pairs [(w1, t1), ...]
    return: chunked sentence as an nltk.Tree
    """
    history = []
    iob_tagged_tokens = []
    for index, (word, tag) in enumerate(tokens):
        iob_tag = self._classifier.predict([self._feature_detector(tokens, index, history)])[0]
        history.append(iob_tag)
        iob_tagged_tokens.append((word, tag, iob_tag))
    return conlltags2tree(iob_tagged_tokens)
def parse(self, tokens):
    """
    Chunk a tagged sentence
    :param tokens: List of words [(w1, t1), (w2, t2), ...]
    :return: chunked sentence: nltk.Tree
    """
    history = []
    iob_tagged_tokens = []
    for index, (word, tag) in enumerate(tokens):
        iob_tag = self._classifier.predict([self._feature_detector(tokens, index, history)])[0]
        history.append(iob_tag)
        iob_tagged_tokens.append((word, tag, iob_tag))
    return conlltags2tree(iob_tagged_tokens)
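# A minimal usage sketch; `chunker` is assumed to be a trained instance of the
# classifier-based chunker these parse methods belong to:
from nltk import pos_tag, word_tokenize
sentence = pos_tag(word_tokenize("Steve Jobs founded Apple in Cupertino."))
print(chunker.parse(sentence))  # -> nltk.Tree with entity chunks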
def predict_IOB_labels(s):
    # Generate query features for sentence s
    query_pos_tags, query_features = process_user_query(s)
    predicted_labels = crf.predict([query_features])[0]
    # Convert the predicted labels into standard (token, pos, label) format
    query_tag_list = [(token_pos[0], token_pos[1], label)
                      for token_pos, label in zip(query_pos_tags, predicted_labels)]
    # Convert into a tree
    query_tree = conlltags2tree(query_tag_list)
    # Traverse the tree and collect the text under each labeled subtree
    labels_dict = {}
    for n in query_tree:
        if isinstance(n, nltk.tree.Tree):
            label = n.label()
            leaves = ' '.join(i[0] for i in n.leaves())
            labels_dict[label] = leaves
    return labels_dict
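# Hypothetical usage; `crf` (a trained CRF model) and `process_user_query`
# (a feature extractor) are assumed to be defined elsewhere in this module:
print(predict_IOB_labels("play some jazz by Miles Davis"))
# -> a dict such as {'genre': 'jazz', 'artist': 'Miles Davis'}, depending on
#    the labels the CRF was trained with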
def mark_entities(tagged_sentence, entity_words, label):
    """
    tagged_sentence: [('Word', 'Tag'), ...]
    entity_words: ['This', 'is', 'an', 'entity']
    label: the entity type
    Returns an nltk.Tree instance with the entity wrapped in a chunk.
    """
    iob_tagged = [(w, t, 'O') for w, t in tagged_sentence]
    words = nltk.untag(tagged_sentence)
    start_index = sub_list(words, entity_words)
    if start_index is not None:
        # The first entity token gets B-, the rest get I-
        iob_tagged[start_index] = (iob_tagged[start_index][0],
                                   iob_tagged[start_index][1], 'B-' + label)
        for idx in range(1, len(entity_words)):
            iob_tagged[start_index + idx] = (iob_tagged[start_index + idx][0],
                                             iob_tagged[start_index + idx][1], 'I-' + label)
    return nltk.conlltags2tree(iob_tagged)
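# `sub_list` is not defined in this snippet. A minimal sketch, assuming it
# returns the start index of the first occurrence of `sub` inside `full`,
# or None when `sub` does not occur:
def sub_list(full, sub):
    for i in range(len(full) - len(sub) + 1):
        if full[i:i + len(sub)] == sub:
            return i
    return None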
def convert_sentprocessed_to_tree(sent_processed):
    sent_tree = []
    for sent in sent_processed:
        sent_tree.append(nltk.conlltags2tree(sent))
    return sent_tree
def parse(self, sent):
    tagged_sents = self.tagger.tag(sent)
    iob_sents = [(w, t, c) for ((w, t), c) in tagged_sents]
    return conlltags2tree(iob_sents)
# Script fragment: train and evaluate a NamedEntityChunker on the GMB corpus.
reader = read_gmb(corpus_root, 1000)
data = list(reader)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]
print("#training samples = %s" % len(training_samples))  # training samples = 55809
print("#test samples = %s" % len(test_samples))  # test samples = 6201

chunker = NamedEntityChunker(training_samples[:5000])
ner = chunker.parse(pos_tag(word_tokenize(
    "Jobs was diagnosed with a pancreatic neuroendocrine tumor in 2003 "
    "and died on October 5, 2011, of respiratory arrest related to the tumor.")))

score = chunker.evaluate([
    conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
    for iobs in test_samples[:500]
])
print(score)
print(pos_tagger)
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos_tagger)
# tree2conlltags needs the chunked tree, so convert before flattening
iob_tags = tree2conlltags(result)
tree = conlltags2tree(iob_tags)
result = result.flatten()

'''
Information retrieval using spaCy
'''
import spacy
import en_core_web_sm
from collections import Counter
from spacy import displacy
from collections import defaultdict
from tabulate import tabulate

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'ABILIFY is indicated for the treatment of schizophrenia in adults and in adolescents aged 15 years and older without any history of myocardial infarctions.')
displacy.serve(doc, style='dep')
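# The named entities spaCy recognized in `doc` can be inspected directly
# (displacy.serve blocks, so run this before serving or instead of it):
for ent in doc.ents:
    print(ent.text, ent.label_)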