Example 1
	def test_interactive(self):
		# Pull documents from the source collection in large batches.
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		# chain flattens the tokenized sentences into one flat token list.
		chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
		# Tag the first few cleaned documents and print the results.
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			print(tagger.tag(chain(doc["cleansed_text"])))
			if ind == 10:
				break
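The chain lambda above flattens sentence-level token lists into a single flat list of tokens before tagging. A minimal self-contained sketch of the same idea, using plain nltk tokenizers and tagger in place of the project's pos and ngrams helpers (an assumption; requires the usual nltk data downloads):

import itertools
import nltk

text = "Dogs bark loudly. Cats ignore them."
sents = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
flat = list(itertools.chain(*sents))
print(flat)                # e.g. ['Dogs', 'bark', 'loudly', '.', 'Cats', 'ignore', 'them', '.']
print(nltk.pos_tag(flat))  # tag the flattened token list, as the test does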
Example 2
def collocationFinder(document, nbest=4):
	"""This is a bigram collocation finder.

	:param document: text to search for bigram collocations
	:param nbest: number of top-scoring bigrams to return
	"""
	# Flatten the tokenized sentences into a single list of tokens.
	chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
	stopset = set(stopwords.words('english'))
	# Drop very short tokens and English stopwords before scoring.
	filter_stops = lambda w: len(w) < 3 or w in stopset
	bcf = BigramCollocationFinder.from_words(chain(document))
	bcf.apply_word_filter(filter_stops)
	# Rank the remaining bigrams by likelihood ratio and return the top nbest.
	return bcf.nbest(BigramAssocMeasures.likelihood_ratio, nbest)
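A self-contained sketch of the same collocation recipe, with nltk's word_tokenize standing in for the project's pos helpers (the sample sentences and that substitution are assumptions):

import itertools
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

sentences = ["The quick brown fox jumps over the lazy dog.",
             "The lazy dog ignores the quick brown fox."]
words = list(itertools.chain(*(nltk.word_tokenize(s) for s in sentences)))
stopset = set(stopwords.words('english'))
bcf = BigramCollocationFinder.from_words(words)
bcf.apply_word_filter(lambda w: len(w) < 3 or w in stopset)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))
# e.g. [('brown', 'fox'), ('quick', 'brown'), ('lazy', 'dog'), ...]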
Example 3
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			# Sentence-split, then word-tokenize the cleaned text.
			sentences = pos.tokenize_sents(doc["cleansed_text"])
			tags = pos.tokenize_words(sentences)
			for sent in tags:
				# POS-tag each sentence, chunk named entities, and print
				# the resulting CoNLL-style (word, POS, IOB) triples.
				tagged_sent = tagger.tag(sent)
				d = ne_chunk(tagged_sent)
				chunks = tree2conlltags(d)
				print(chunks)
			if ind == 10:
				break
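tree2conlltags(ne_chunk(...)) turns the chunk tree into a flat list of (word, POS, IOB) triples, which is what the test prints. A self-contained illustration with plain nltk (the sample sentence is an assumption):

import nltk
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags

tokens = nltk.word_tokenize("Barack Obama visited Paris in 2009.")
tree = ne_chunk(nltk.pos_tag(tokens))
print(tree2conlltags(tree))
# e.g. [('Barack', 'NNP', 'B-PERSON'), ('Obama', 'NNP', 'I-PERSON'),
#       ('visited', 'VBD', 'O'), ('Paris', 'NNP', 'B-GPE'), ...]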
Example 4
def filter_words(text):
	"""Prepares the reddit document for bag of words. Turns the text
	into a list of lowercased words with English stopwords removed.

	:param text: blob of text
	:type text: [str]
	"""
	sents = pos.tokenize_words(pos.tokenize_sents(text))
	final = []
	# Turn the list of sentences into a flat list of words.
	for sent in sents:
		final.extend(sent)
	stop = stopwords.words('english')
	# Drop stopwords, then lowercase what remains.
	final = [w for w in final if w.lower() not in stop]
	final = [w.lower() for w in final]
	return final
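A brief usage sketch for filter_words, assuming the module-level pos and stopwords imports it relies on are in place and that text is a list of sentence strings, per the [str] type in the docstring:

doc_text = ["The cats are sleeping in the sun.", "A dog barks outside."]
print(filter_words(doc_text))
# e.g. ['cats', 'sleeping', 'sun', '.', 'dog', 'barks', 'outside', '.']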
Example 5
 def test_interactive(self):
     docs = self.source.find_clean(batch_size=1000)
     tagger = ngrams.make_backoff_tagger()
     print()
     # find_clean is assumed to yield (index, document) pairs directly.
     for ind, doc in docs:
         sentences = pos.tokenize_sents(doc["cleansed_text"])
         tags = pos.tokenize_words(sentences)
         for sent in tags:
             # Tag, chunk, convert to CoNLL triples, then print the chunks
             # along with the extracted named entities and nouns.
             tagged_sent = tagger.tag(sent)
             d = ne_chunk(tagged_sent)
             chunks = tree2conlltags(d)
             print("CHUNKS" + str(chunks))
             print("NE" + str(cnll.get_ne(chunks)))
             print("NOUNS" + str(cnll.get_nouns(chunks)))
         if ind == 10:
             break
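cnll.get_ne and cnll.get_nouns are project helpers; a rough stand-in sketch (an assumption about their behavior) that pulls the same information straight out of the (word, POS, IOB) triples:

def get_nouns(chunks):
    # keep words whose POS tag is a noun variant (NN, NNS, NNP, NNPS)
    return [word for word, tag, iob in chunks if tag.startswith("NN")]

def get_ne(chunks):
    # keep tokens that fall inside a named-entity chunk (IOB tag not 'O')
    return [[word, tag, iob] for word, tag, iob in chunks if iob != "O"]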
Example 6
def clean_dict(doc, tagger=nltk.pos_tag):
    """Processes NLP features from cleansed_text. All other functions
    wrap this one. Serves as the NLP front end for reddit corpus
    parsing. Dictionaries and JSON strings are accepted, and dictionaries
    containing additional information are returned. The processing done
    here represents the general annotations; classifiers will modify or
    wrap these methods. The following new fields are added to the
    dictionary.

    ::

        {
            conlltags       : [[(word, pos, BIO)]],
            nouns           : [word],
            named_entities  : [[word, pos, BIO]],
            cleansed_text   : [[word]]
        }

    :param doc: dictionary of reddit corpus.
    :type doc: dict

    :param tagger: A POS tagger.
    :type tagger: Tagger

    :returns: dict
    """
    # Drop the MongoDB id so the annotated copy can be re-inserted.
    if "_id" in doc:
        del doc["_id"]
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        # POS-tag with the supplied tagger, chunk named entities, and
        # record the CoNLL triples plus the extracted nouns and entities.
        tagged_sent = tagger(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
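A minimal usage sketch for clean_dict, assuming the module imports shown below are in place and that cleansed_text holds the document's sentences (the sample record is an assumption):

sample = {"_id": "abc123",
          "cleansed_text": ["Barack Obama visited Paris.", "He gave a speech."]}
annotated_doc = clean_dict(sample)
print(annotated_doc["nouns"])            # e.g. ['Barack', 'Obama', 'Paris', 'speech']
print(annotated_doc["named_entities"])   # e.g. [['Barack', 'NNP', 'B-PERSON'], ...]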
import nltk  # used by the clean_dict default tagger (nltk.pos_tag)
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    # rdtcorp (the corpus module providing Source) and clean (the module
    # providing doc_iter), presumably from the rdt package, are imported
    # elsewhere and are not shown in this excerpt.
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # Flush the buffer every 1000 documents; the original condition,
        # ind % 1000, inserted on nearly every iteration instead.
        if (ind + 1) % 1000 == 0:
            annotated.insert(buf)
            buf = []
    # Insert whatever remains after the loop finishes.
    if buf:
        annotated.insert(buf)