def parse(self, question):
    pos_tags = [pos for (word, pos) in question]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(question, chunktags)]
    print(conlltags)
    return conlltags2tree(conlltags)

def parse(self, sentence):
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(sentence, chunktags)]
    return conlltags2tree(conlltags)

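# For context: parse methods like the ones above usually live in a chunker
# class in the style of the NLTK book's UnigramChunker. A minimal runnable
# sketch, assuming the conll2000 corpus is available; the class name and
# training setup follow the NLTK book rather than any one snippet here:
import nltk
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.corpus import conll2000

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # learn a POS-tag -> IOB-chunk-tag mapping from chunked training trees
        train_data = [[(t, c) for (w, t, c) in tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return conlltags2tree(conlltags)

# usage: train on conll2000 NP chunks, then parse a POS-tagged sentence
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
chunker = UnigramChunker(train_sents)
print(chunker.parse([('the', 'DT'), ('little', 'JJ'), ('dog', 'NN')]))
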
def update(self, data):
    self.chunks = {}
    try:
        feature = data
        chunks = feature['chunked']
        tree = conlltags2tree(chunks)
        for chunk_name in self.target_chunks:
            succeeded_chunk = self.getChunk(tree, chunk_name)
            if succeeded_chunk and chunk_name not in self.chunks:
                self.chunks[chunk_name] = succeeded_chunk
        if "LOCATION" in str(self.chunks):
            print("-" * 120)
            print(colored("\n[ORIGINAL TWEET]", 'yellow'))
            print(feature['original'])
            for key in self.chunks:
                print(colored('[<<EXTRACTED PHRASES>>]:', 'blue'))
                msg = "<<Phrase: " + key + " >>"
                print(colored(msg, 'green'))
                self.iprint(self.chunks[key], key)
                print(" ")
            # pause briefly to view the results
            sleep(4.0)
    except Exception:
        pass

def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll triples for tree parsing
    return conlltags2tree([(w, t, c) for (w, (t, c)) in zip(words, chunks)])

def parse(self, sentence):
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(sentence, chunktags)]
    return conlltags2tree(conlltags)

def parse(self, sentence):
    # classify chunks for a list of word-tag pairs
    chunked_sents = self.tagger.tag(sentence)
    # convert to tree
    return conlltags2tree([(word, tag, chunk)
                           for ((word, tag), chunk) in chunked_sents])

def parse(self, sentence):  # [_code-unigram-chunker-parse]
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(sentence, chunktags)]
    # print("input to conlltags", conlltags)
    return conlltags2tree(conlltags)

def parse(self, tokens):
    """Parse a tagged sentence into chunks."""
    if not tokens:
        return None
    chunked = self.tagger.tag(tokens)
    return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])

def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll triples for tree parsing
    wtc = zip(words, chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)

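# A quick illustration of the triplet format that conlltags2tree expects
# (the tags here are hand-written; only NLTK itself is assumed):
from nltk.chunk.util import conlltags2tree

triplets = [('the', 'DT', 'B-NP'), ('little', 'JJ', 'I-NP'),
            ('dog', 'NN', 'I-NP'), ('barked', 'VBD', 'O')]
print(conlltags2tree(triplets))
# (S (NP the/DT little/JJ dog/NN) barked/VBD)
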
def parse(self, tagged_sent):
    """Parse tagged tokens into a parse Tree of chunks."""
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll triples for tree parsing
    wtc = zip(words, chunks)  # the original used itertools.izip, which is Python 2-only
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

def parse(self, tagged_sent):
    # import here rather than at module top so we don't fail if pattern isn't installed
    from pattern.en import parse
    s = ' '.join([word for word, tag in tagged_sent])
    # not tokenizing ensures that the number of tagged tokens returned is
    # the same as the number of input tokens
    sents = parse(s, tokenize=False).split()
    if not sents:
        return None
    return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])

def parse(self, sentence):
    tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
    pos_tags = [pos for (_, pos) in tokenized]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(tokenized, chunktags)]
    return conlltags2tree(conlltags)

def parse(self, tagged_sentence):
    if not tagged_sentence:
        return None
    pos_tags = [tag for word, tag in tagged_sentence]
    chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
    chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
    wpc_tags = [(word, pos_tag, chunk_tag)
                for ((word, pos_tag), chunk_tag) in zip(tagged_sentence, chunk_tags)]
    return conlltags2tree(wpc_tags)

def parse(self, tokens):
    """Parse a tagged sentence into chunks."""
    if not tokens:
        return None
    (words, tags) = zip(*tokens)
    gen_chunks = self.tagger.tag(tags)
    wtc = zip(words, gen_chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

def parse(self, sentence):
    tagged_sents = self.tagger.tag(sentence)
    conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
    # return conlltags  # returns just the tags
    response_dict = dict()
    response_dict["tags"] = conlltags
    response_dict["tree"] = conlltags2tree(conlltags)
    # return str(conlltags) + "_SEPARATOR_" + str(conlltags2tree(conlltags))
    # dump = json.dumps(response_dict)
    return response_dict

def parse(self, tagged_sent):
    iobs = []
    in_person = False
    for word, tag in tagged_sent:
        if word in self.name_set and in_person:
            iobs.append((word, tag, 'I-PERSON'))
        elif word in self.name_set:
            iobs.append((word, tag, 'B-PERSON'))
            in_person = True
        else:
            iobs.append((word, tag, 'O'))
            in_person = False
    return conlltags2tree(iobs)

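# A runnable wrapper for the name-set tagger above; the class name and the
# sample name_set are illustrative assumptions, not part of the original:
import nltk
from nltk.chunk.util import conlltags2tree

class PersonChunker(nltk.ChunkParserI):
    def __init__(self, name_set):
        self.name_set = name_set

    def parse(self, tagged_sent):
        iobs = []
        in_person = False
        for word, tag in tagged_sent:
            if word in self.name_set and in_person:
                iobs.append((word, tag, 'I-PERSON'))
            elif word in self.name_set:
                iobs.append((word, tag, 'B-PERSON'))
                in_person = True
            else:
                iobs.append((word, tag, 'O'))
                in_person = False
        return conlltags2tree(iobs)

chunker = PersonChunker({'John', 'Smith'})
print(chunker.parse([('John', 'NNP'), ('Smith', 'NNP'), ('sings', 'VBZ')]))
# (S (PERSON John/NNP Smith/NNP) sings/VBZ)
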
def parse(self, tagged_sentence):
    if not tagged_sentence:
        return None
    # Separate out the POS tags from the sentence
    pos_tags = [tag for word, tag in tagged_sentence]
    # Use the chunk tagger to get IOB (chunk) tags for the sentence,
    # using the POS tags of its words as input
    chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
    chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
    # Combine the obtained IOB tags with the words and POS tags to form WTC triples
    wpc_tags = [(word, pos_tag, chunk_tag)
                for ((word, pos_tag), chunk_tag) in zip(tagged_sentence, chunk_tags)]
    # Build the shallow parse tree from the WTC triples of the sentence
    return conlltags2tree(wpc_tags)

def update(self, data):
    self.chunks = {}
    try:
        feature = data
        chunks = feature['chunked']
        tree = conlltags2tree(chunks)
        for chunk_type in self.target_chunks:
            succeeded_chunk = self.getChunk(tree, chunk_type)
            if succeeded_chunk and chunk_type not in self.chunks:
                self.chunks[chunk_type] = succeeded_chunk
        if self.to_show in str(self.chunks):
            print("-" * 120)
            self.pretty_print(feature, self.chunks)
            sleep(self.sleep_time)
    except Exception as e:
        print(str(e))

def cross_val(tag_file):
    """Perform leave-one-out cross validation for a NER chunker given a tag file."""
    print("\nRunning cross validation score on data set from " + tag_file + ":")
    reader = read_gmb(tag_file)
    data = list(reader)
    random.shuffle(data)
    acc = 0
    script_cor = 0
    for i in range(len(data)):
        test_sample = data[i]
        training_samples = data[:]
        del training_samples[i]
        chunker = NamedEntityChunker(training_samples)
        score = chunker.evaluate(
            [conlltags2tree([(w, t, iob) for (w, t), iob in test_sample])])
        # _tags_correct / _tags_total are private ChunkScore attributes
        acc += score._tags_correct / score._tags_total
        if score._tags_correct == score._tags_total:
            script_cor += 1
    print("Overall tagging accuracy: {0:.2f}%".format(acc / len(data) * 100))
    print("Percentage of scripts correct: {0:.2f}%".format(script_cor / len(data) * 100))

def parse(self, text, conlltags=True):
    """
    Given a text, applies tokenization, part-of-speech tagging and
    matches the gazetteer words with their tags. Returns a conll tree.

    :param text: The text to parse
    :type text: str
    :param conlltags: If True, return an nltk Tree; otherwise return the IOB triples
    :type conlltags: bool
    :return: A conll tree (or the IOB triples)
    """
    # apply the regular expressions and find all the
    # gazetteer words in the text
    for prog, tag in self.progs:
        words_found = set(prog.findall(text))  # keep the unique words
        if len(words_found) > 0:
            for word in words_found:  # words_found may be more than one
                self.words.append(word)    # keep the words
                self.iobtags.append(tag)   # and their tag
    # find the pattern with the maximum number of words;
    # this will be the look-ahead variable
    for word in self.words:  # don't care about tags now
        nwords = word.count(' ')
        if nwords > self.lookahead:
            self.lookahead = nwords
    # tokenize and apply part-of-speech tagging
    tagged_sent = self.pos_tag(self.tokenize(text))
    # find the iob tags
    iobs = self.iob_tags(tagged_sent)
    if conlltags:
        return conlltags2tree(iobs)
    else:
        return iobs

def parse(self, sentence):
    tagged_sents = self.tagger.tag(sentence)
    conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
    return conlltags2tree(conlltags)

def parse(self, tagged_sent):
    if not tagged_sent:
        return None
    chunks = self.tagger.tag(tagged_sent)
    return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])

import nltk
from nltk.corpus import ieer

def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    for doc in ieer.parsed_docs():
        tagged = ieertree2conlltags(doc.text, tag)
        yield conlltags2tree(tagged)

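# The ieertree2conlltags helper used above isn't shown in this snippet. A
# minimal sketch of one possible implementation, assuming the IEER document
# trees mark entities as subtrees labelled with the entity type:
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    # tree.pos() pairs each leaf word with its immediate parent's label;
    # words outside any entity subtree get the root label
    words, ents = zip(*tree.pos())
    iobs, prev = [], None
    for ent in ents:
        if ent == tree.label():       # outside an entity
            iobs.append('O')
            prev = None
        elif prev == ent:             # continuing the same entity type
            iobs.append('I-%s' % ent)
        else:                         # starting a new entity
            iobs.append('B-%s' % ent)
            prev = ent
    words, tags = zip(*tag(list(words)))  # POS-tag the raw words
    # note: adjacent entities of the same type get merged; fine for a sketch
    return list(zip(words, tags, iobs))
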
def parse(self, tagged_sent):
    iobs = self.iob_locations(tagged_sent)
    return conlltags2tree(iobs)

def parse(self, tagged_sent):
    if not tagged_sent:
        return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    wtc = zip(words, chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

print(rc.evaluate(test_data))

from nltk.chunk.util import tree2conlltags, conlltags2tree

train_sent = train_data[7]
print(train_sent)

wtc = tree2conlltags(train_sent)
wtc  # in an interactive session this displays the IOB triples
tree = conlltags2tree(wtc)
print(tree)

def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

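# Usage sketch for conll_tag_chunks and combined_tagger above; the corpus
# choice is an assumption (any chunked Trees would do):
from nltk.corpus import conll2000
train_trees = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
train_chunks = conll_tag_chunks(train_trees)
# trains a UnigramTagger first, then a BigramTagger that backs off to it
chunk_tagger = combined_tagger(train_chunks, [UnigramTagger, BigramTagger])
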
def parse(self, tagged_sent):
    # This function parses a sentence and identifies locations
    iobs = self.iob_locations(tagged_sent)
    return conlltags2tree(iobs)

        # fragment of read_data: inside its loop over lines of the tag file
        if line.isspace():
            if not iob_pos_tweet:
                # print(prev_line)
                # print(counter)
                print(prev_tokens)
                continue
                # NOTE: unreachable after the continue above
                raise ValueError('tweet empty')
            tweets.append(iob_pos_tweet)
            iob_pos_tweet = []
        else:
            line = line.strip()
            tokens = line.split()
            if not tokens:
                raise ValueError('tokens empty')
            iob_pos_tweet.append(((tokens[0], tokens[1]), tokens[2]))
            prev_tokens = tokens
            prev_line = line
        counter = counter + 1
    return tweets

# research_project/nltk/
training_data = read_data('train.txt')
chunker = NamedEntityChunker(training_data)

test_data = read_data('test.txt')
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
                          for iobs in test_data])
print('precision: ', score.precision())
print('recall: ', score.recall())
print('f1: ', score.f_measure())

# tweets = [pos_tag(t) for t in tweets]
# sent = nltk.corpus.treebank.tagged_sents()[22]
# tweets = [nltk.ne_chunk(t) for t in tweets]

'''
Created on Jul 20, 2015

@author: dongx
'''
import nltk
from nltk.corpus.reader import ConllChunkCorpusReader
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tree import Tree
from nltk.corpus import treebank
from nltk.corpus import conll2000

iob = tree2conlltags(Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]))
tree = conlltags2tree([('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')])

print("--------conversion between iob and tree---------------------")
print(iob)
print(tree)

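# Running the script prints the separator line followed by:
# [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]
# (S (NP the/DT book/NN))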