Code Example #1
File: CNC.py Project: patricksttan/Iamthinking
 def __init__(self, depth, train_sents):
     global maxDepth
     maxDepth = depth
     self.dicTree = DefDict(1)
     for s in train_sents:
         for (w,p,c) in tree2conlltags(s):
             if c == "O":
                 self.train(w)
Code Example #2
File: Printer.py Project: buhtigexa/Nerit
	def getChunk(self,tree,target_token):

		target=[]
		for subtree in tree.subtrees(filter=lambda t:target_token.lower() in t.label().lower()):
			word,post,iob_chunk=zip(*tree2conlltags(subtree))
			word=self.toString(word)
			target.append(word)
		return target
Code Example #3
File: Chunker.py Project: danjamker/N-Fly
 def __init__(self, POS):
     '''      
     @param POS: the POS tagger is passed through 
     '''
     train_sents = conll2000.chunked_sents()
     train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                   for sent in train_sents]
     self.T = nltk.TrigramTagger(train_data)
     self.Tagger = POS
     self.tmp = []
Code Example #4
File: chunkers.py Project: ShunyuanZ/nltk3-cookbook
def conll_tag_chunks(chunk_sents):
	'''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
	so the final result is a list of lists of (tag, chunk_tag) tuples.
	>>> from nltk.tree import Tree
	>>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
	>>> conll_tag_chunks([t])
	[[('DT', 'B-NP'), ('NN', 'I-NP')]]
	'''
	tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
	return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Code Example #5
def conll_tag_chunks(chunk_sents):
	'''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
	so the final result is a list of lists of (tag, chunk_tag) tuples.
	>>> from nltk.tree import Tree
	>>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
	>>> conll_tag_chunks([t])
	[[('DT', 'B-NP'), ('NN', 'I-NP')]]
	Source: https://github.com/japerk/nltk-trainer/blob/master/nltk_trainer/chunking/chunkers.py
	'''
	tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
	return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Code Example #6
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			sentences = pos.tokenize_sents(doc["cleansed_text"])
			tags = pos.tokenize_words(sentences)
			for sent in tags:
				tagged_sent = tagger.tag(sent)
				d = ne_chunk(tagged_sent)
				chunks = tree2conlltags(d)
				print(chunks)
			if ind == 10:
				break
Code Example #7
File: Chunker.py Project: julius-jsr/text_sum
 def __init__(self, train_sents):
     tag_sents = [tree2conlltags(sent) for sent in train_sents]
     train_chunks = [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
     train_set = []
     for tagged_sent in train_chunks:
         #print tagged_sent
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             
             featureset = self.featx(untagged_sent, i, history)
             #print featureset,tag
             train_set.append( (featureset, tag) )
             history.append(tag)
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
Code Example #8
 def test_interactive(self):
     docs = self.source.find_clean(batch_size=1000)
     tagger = ngrams.make_backoff_tagger()
     print()
     for ind, doc in docs:
         sentences = pos.tokenize_sents(doc["cleansed_text"])
         tags = pos.tokenize_words(sentences)
         for sent in tags:
             tagged_sent = tagger.tag(sent)
             d = ne_chunk(tagged_sent)
             chunks = tree2conlltags(d)
             print("CHUNKS" + str(chunks))
             print("NE" + str(cnll.get_ne(chunks)))
             print("NOUNS" + str(cnll.get_nouns(chunks)))
         if ind == 10:
             break
Code Example #9
	def fix(self, feature):
		cleanSentence = feature
		tree = None
		try:
			grammar_pattern_to_clean = r'_.*'  # separator character between levels within a single token.
			clean_pattern = ''
			modified_chunk_pattern = r'.*_'
			words, post, iobs = zip(*feature)
			wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))  # the sentences to parse now use the word and IOB tag instead of the POS tag.
			sentence = zip(words, wiobs)
			tree = self.postChunker.parse(sentence)
			loc_tags = tree2conlltags(flatten_deeptree(tree))  # go from the tree back to a list of tuples.
			cleanSentence = cleanIobs(words, post, loc_tags, grammar_pattern_to_clean, modified_chunk_pattern, clean_pattern)
		except Exception as e:
			pass
Code Example #10
def clean_dict(doc, tagger=nltk.pos_tag):
    """ Processes NLP features from cleansed_text. All other functions
	wrap this one. 
	Serves to act as the NLP-front end for reddit corpus
	parsing. Dictionaries and json strings are accepted and return
	dictionaries containing additional information. The processing
	done here represents the general annotations. The following
	are the new fields added to the dictionary. Classifiers
	will work to modify or wrap these methods. 

	::

		{
			conlltags 		: [[(word, pos, BIO)]],
			nouns 			: [word],
			named_entities 		: [[word, pos, BIO]],
			cleansed_text 		: [[word]]
		}

	:param doc: dictionary of reddit corpus.
	:type doc: dict

	:param tagger: A pos tagger. 
	:type tagger: Tagger

	:returns: dict
	"""

    if "_id" in doc:
        del (doc["_id"])
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        tagged_sent = tagger(sent) or []  # use the tagger argument (defaults to nltk.pos_tag)
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
Code Example #11
 def extract(self,text):
     """
     """
     tokens = self.tokenizer.tokenize(text)
     tagged_terms = self.tagger.tag(tokens)
     terms = {}
     np_terms = {}
     
     noun_phrases = [
         node
         for node in self.np_finder.parse(tagged_terms)
         if not isinstance(node,tuple)]
     
     for node in noun_phrases:
         coll_tag = tree2conlltags(node)
         if len(coll_tag) > 1:
             mterm = [
                 term.lower()
                 for (term,tag,temp) in coll_tag
                 if len(term)>1
                 ]
             
             mterm = ' '.join(mterm)
             self._add(mterm,np_terms)
         for (term,tag,temp) in coll_tag:
             if tag.startswith('N') and len(term)>1:
                 if tag in ['NNS','NNPS']:
                     term = singularize(term)
                 self._add(term.lower(),terms)
     
     # iterate over a copy of the keys so entries can be deleted while filtering
     for term in list(terms.keys()):
         if not self.filter(term,terms[term]):
             del terms[term]
     
     for term in list(np_terms.keys()):
         if not self.filter(term,np_terms[term]):
             del np_terms[term]
     
     return (terms,np_terms)
Code Example #12
 def extract(self, text, locale='en'):
     """
     """
     tokenizer = queryUtility(ITokenizer, name=locale)
     tagger = queryUtility(IPOSTagger, name=locale)
     if not tagger or not tokenizer:
         #Non-supported language
         return
     tokens = tokenizer.tokenize(text)
     tagged_terms = tagger.tag(tokens)
     terms = {}
     np_terms = {}
     noun_phrases = [
         node
         for node in tagger.np_grammar.parse(tagged_terms)
         if not isinstance(node, tuple)]
     for node in noun_phrases:
         coll_tag = tree2conlltags(node)
         if len(coll_tag) > 1:
             mterm = [
                 term.lower()
                 for (term, tag, temp) in coll_tag
                 if len(term)>1]
             mterm = ' '.join(mterm)
             if mterm:
                 self._add(mterm, np_terms)
         for (term, tag, temp) in coll_tag:
             if tag.startswith('N') and len(term)>1:
                 term = tagger.normalize(term, tag)
                 self._add(term.lower(), terms)
     # iterate over a copy of the keys so entries can be deleted while filtering
     for term in list(terms.keys()):
         if not self.filter(term, terms[term]):
             del terms[term]
     for term in list(np_terms.keys()):
         if not self.filter(term, np_terms[term]):
             del np_terms[term]
     return (terms, np_terms)
Code Example #13
def main():

  wsjsubset = open("../corpus/wsjsubset", 'r').readlines()
  genia = open("../corpus/genia", 'r').readlines()
  txt_esp1 = open("../corpus/espanol1", 'r').readlines() 
  txt_esp2 = open("../corpus/espanol2", 'r').readlines() 

  words = []
  postag = []
  chunktag = []
  for line in txt_esp1:
    if len(line.split()) > 0:
      words.append(line.split()[0])
      postag.append(line.split()[1])
      chunktag.append(line.split()[2])
  
  postag_nltk = pos_tag(words) 
  chunktag_nltk = tree2conlltags(chunker.parse(postag_nltk))
  print chunktag_nltk
  cant_nominales_nltk = 0
  cant_nominales_gold = 0
  cant_nominales_hit = 0

  for i in xrange(len(chunktag_nltk)):
    if chunktag_nltk[i][2] in ['I-NP', 'B-NP']:
      cant_nominales_nltk += 1
      if chunktag_nltk[i][2] == chunktag[i]:
        cant_nominales_hit += 1
    if chunktag[i] in ['I-NP', 'B-NP']:
      cant_nominales_gold += 1

  precision = cant_nominales_hit / float(cant_nominales_nltk)
  recall = cant_nominales_hit / float(cant_nominales_gold)


  print "Precision: ", precision
  print "Recall: ", recall
Code Example #14
 def __init__(self, featuremap, train_sents):
     tagged_sents = [[((w,t),c) for (w,t,c) in
                      tree2conlltags(sent)]
                     for sent in train_sents]
     self.tagger = _ConsecutiveNPChunkTagger(featuremap, tagged_sents)
Code Example #15
def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Code Example #16
File: chunkers.py Project: ShunyuanZ/nltk3-cookbook
def chunk_trees2train_chunks(chunk_sents):
	tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
	return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
Code Example #17
def conll_tag_chunks(chunk_sents):
  tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
  return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Code Example #18
c = rc.parse(tagged_sentence)

print c

print rc.evaluate(test_data)


   


from nltk.chunk.util import tree2conlltags, conlltags2tree

train_sent = train_data[7]
print train_sent

wtc = tree2conlltags(train_sent)
wtc

tree = conlltags2tree(wtc)
print tree
    

def conll_tag_chunks(chunk_sents):
  tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
  return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
  
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
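
For context, the two helpers above are typically combined to build a tagger-based chunker. The following is a minimal sketch (not part of the original notebook), assuming NLTK and the conll2000 corpus are installed; only standard NLTK tagger classes and the corpus reader are used:

# Minimal sketch: train a chunk tagger from (pos, iob) pairs using the helpers above.
import nltk
from nltk.corpus import conll2000

train_data = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
train_chunks = conll_tag_chunks(train_data)          # [[(pos, iob), ...], ...]
ngram_tagger = combined_tagger(train_chunks,
                               [nltk.UnigramTagger, nltk.BigramTagger])
print(ngram_tagger.tag(['DT', 'NN', 'VBZ']))
# e.g. [('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'O')]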
  
Code Example #19
File: IOBtagging.py Project: dxr1988/NLTK-Research
'''
Created on Jul 20, 2015

@author: dongx
'''
import nltk
from nltk.corpus.reader import ConllChunkCorpusReader
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tree import Tree
from nltk.corpus import treebank
from nltk.corpus import conll2000

iob = tree2conlltags(Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]))
tree = conlltags2tree([('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')])

print("--------convertion between iob and tree---------------------")
print(iob)
print(tree)
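
For reference, the two print calls above should produce output roughly like:

# [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]
# (S (NP the/DT book/NN))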
Code Example #20
def chunk_trees2train_chunks(chunk_sents):
    """
        Convert tuples (word, pos, iob) to ((word, pos), iob)
    """
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
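
A minimal usage sketch (not part of the original file), reusing the tree from the doctest in Code Example #4:

from nltk.tree import Tree
from nltk.chunk.util import tree2conlltags

t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
chunk_trees2train_chunks([t])
# [[(('the', 'DT'), 'B-NP'), (('book', 'NN'), 'I-NP')]]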
Code Example #21
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del (doc["_id"])
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # flush the buffer to the annotated collection every 1000 documents
        if ind and ind % 1000 == 0:
            annotated.insert(buf)
            buf = []
    if buf:
        annotated.insert(buf)
Code Example #22
def conll_tag_chunks(chunk_sents):
    """
        Extracts a list of tuples (pos, iob) from a list of trees.
    """
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Code Example #23
def chunk_trees2train_chunks(chunk_sents):
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]