def test_interactive(self):
    docs = self.source.find_clean(batch_size=1000)
    tagger = ngrams.make_backoff_tagger()
    print()
    for ind, doc in docs:
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print("CHUNKS" + str(chunks))
            print("NE" + str(cnll.get_ne(chunks)))
            print("NOUNS" + str(cnll.get_nouns(chunks)))
        if ind == 10:
            break
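# Illustrative sketch (not part of the test class): what one sentence looks
# like at each stage of the chunking pipeline exercised above, using plain
# NLTK in place of the project's tokenizers and backoff tagger. Assumes the
# standard NLTK models (punkt, averaged_perceptron_tagger, maxent_ne_chunker,
# words) are installed.
def _demo_chunk_pipeline():
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk.util import tree2conlltags

    sent = word_tokenize("Guido van Rossum created Python.")
    tagged_sent = pos_tag(sent)                     # [(word, pos), ...]
    chunks = tree2conlltags(ne_chunk(tagged_sent))  # [(word, pos, BIO), ...]
    # Entity tokens carry B-/I- prefixed BIO tags (e.g. "B-PERSON");
    # non-entity tokens are tagged "O".
    return chunks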
def clean_dict(doc, tagger=nltk.pos_tag):
    """Process NLP features from cleansed_text. All other functions wrap
    this one; it serves as the NLP front end for reddit corpus parsing.

    Dictionaries and JSON strings are accepted, and a dictionary containing
    the additional information is returned. The processing done here
    represents the general annotations; classifiers will modify or wrap
    these methods. The following new fields are added to the dictionary::

        {
            conlltags      : [[(word, pos, BIO)]],
            nouns          : [word],
            named_entities : [[word, pos, BIO]],
            cleansed_text  : [[word]]
        }

    :param doc: dictionary of reddit corpus.
    :type doc: dict
    :param tagger: A POS tagger.
    :type tagger: Tagger
    :returns: dict
    """
    if "_id" in doc:
        del doc["_id"]
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        # Use the injected tagger rather than hard-coding nltk.pos_tag,
        # so callers can supply an alternative tagger.
        tagged_sent = tagger(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
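# Usage sketch for clean_dict() (hypothetical input; the exact shape of
# cleansed_text is assumed to be whatever pos.tokenize_sents accepts, here
# guessed to be raw comment text).
def _demo_clean_dict():
    doc = {"_id": "abc123",
           "cleansed_text": "Guido van Rossum created Python."}
    annotated = clean_dict(doc)
    # Fields added by clean_dict:
    #   annotated["conlltags"]       -> one [(word, pos, BIO), ...] list per sentence
    #   annotated["nouns"]           -> flat list of noun tokens
    #   annotated["named_entities"]  -> flat list of named-entity entries
    return annotated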
from nltk import ne_chunk
from nltk.chunk.util import tree2conlltags

import rdt.nlp.conll_get as cnll

# rdtcorp, clean, pos and ngrams are project-local modules assumed to be
# imported elsewhere in this script.

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # Flush the buffer to the annotated collection every 1000 documents.
        if (ind + 1) % 1000 == 0:
            annotated.insert(buf)
            buf = []
    if buf:
        annotated.insert(buf)
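# Sketch of the buffered-insert pattern used by the script above, factored
# into a generic helper (hypothetical; not part of rdt). insert_fn stands in
# for annotated.insert and may be any callable that accepts a list of docs.
def insert_batched(docs, insert_fn, batch_size=1000):
    buf = []
    for doc in docs:
        buf.append(doc)
        if len(buf) >= batch_size:   # flush a full batch
            insert_fn(buf)
            buf = []
    if buf:                          # flush the final partial batch
        insert_fn(buf)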