Example No. 1
import numpy as np
from nltk.corpus import brown
from chunking import splitter
from sklearn.feature_extraction.text import CountVectorizer
if __name__ == '__main__':
    # Read the data from the Brown corpus
    data = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 2000

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)

    for text in text_chunks:
        chunk = {'index': counter, 'text': text}
        chunks.append(chunk)
        counter += 1

    # Extract the document-term matrix
    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    doc_term_matrix = vectorizer.fit_transform([chunk['text'] for chunk in chunks])

    vocab = np.array(vectorizer.get_feature_names())
    print("\nVocabulary:")
    print(vocab)

    print("\nDocument term matrix:")
    chunk_names = ['chunk-0', 'chunk-1', 'chunk-2', 'chunk-3', 'chunk-4']
    formatted_row = '{:>12}' * (len(chunk_names) + 1)
    print('\n', formatted_row.format('Word', *chunk_names), '\n')

    # Each row of the transposed matrix corresponds to one word;
    # item.data holds its non-zero counts across the chunks
    for word, item in zip(vocab, doc_term_matrix.T):
        output = [str(x) for x in item.data]
        print(formatted_row.format(word, *output))
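Each of these examples imports splitter from a local chunking module that is not shown on this page. The following is only a minimal sketch of what such a helper might look like, under the assumption that it simply groups the input text into chunks of num_words words each:

# chunking.py -- hypothetical sketch of the splitter helper imported above.
# Assumption: it splits the input text into pieces of roughly num_words words.
def splitter(text, num_words):
    words = text.split(' ')
    chunks = []
    current_words = []
    for word in words:
        current_words.append(word)
        if len(current_words) == num_words:
            chunks.append(' '.join(current_words))
            current_words = []
    # Keep any leftover words as a final, shorter chunk
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks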
Example No. 2
import numpy as np
from nltk.corpus import brown
from chunking import splitter

if __name__ == '__main__':
    # Read the data from the Brown corpus
    content = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 2000

    chunks = []
    counter = 0

    text_chunks = splitter(content, num_words)

    for text in text_chunks:
        chunk = {'index': counter, 'text': text}
        chunks.append(chunk)
        counter += 1

    # Extract document term matrix
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    doc_term_matrix = vectorizer.fit_transform(
        [chunk['text'] for chunk in chunks])

    vocab = np.array(vectorizer.get_feature_names())
    print("\nVocabulary:")
    print(vocab)
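Example No. 2 stops after printing the vocabulary. Continuing with the same variables, here is a small sketch of how the per-chunk counts of a single word could be looked up; the word 'data' is only an illustrative choice and may not survive the min_df/max_df filtering. CountVectorizer exposes the term-to-column mapping through its vocabulary_ attribute:

    # Continuation of Example No. 2: per-chunk counts for one word.
    word = 'data'  # illustrative word; it may have been filtered out
    if word in vectorizer.vocabulary_:
        column = vectorizer.vocabulary_[word]
        counts = doc_term_matrix[:, column].toarray().ravel()
        for chunk, count in zip(chunks, counts):
            print("chunk-{}: {} occurrence(s) of '{}'".format(chunk['index'], count, word))
    else:
        print("'{}' was filtered out of the vocabulary".format(word))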
Example No. 3
import numpy as np
from nltk.corpus import brown
from chunking import splitter

import nltk
nltk.download('brown')

# Read the input data from the Brown corpus inside the main guard
if __name__ == '__main__':
    content = ' '.join(brown.words()[:10000])

    # split the text into chunks
    num_of_words = 2000
    num_chunks = []
    count = 0
    texts_chunk = splitter(content, num_of_words)

    # build a vocabulary based on these text chunks
    for text in texts_chunk:
        num_chunk = {'index': count, 'text': text}
        num_chunks.append(num_chunk)
        count += 1

    # Extract the document-term matrix, which counts the number of
    # occurrences of each word in every chunk
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(min_df=0.5, max_df=0.95)
    matrix = vectorizer.fit_transform(
        [num_chunk['text'] for num_chunk in num_chunks])
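Example No. 3 builds the document-term matrix but never prints it. As a follow-up sketch, assuming a recent scikit-learn release (where get_feature_names_out() replaces the older get_feature_names()), the vocabulary and the raw counts could be inspected like this:

    # Continuation of Example No. 3: vocabulary and the dense count matrix.
    # get_feature_names_out() assumes scikit-learn >= 1.0;
    # older releases use get_feature_names() instead.
    vocab = vectorizer.get_feature_names_out()
    print("\nVocabulary:")
    print(vocab)
    print("\nDocument term matrix (one row per chunk, one column per word):")
    print(matrix.toarray())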