import numpy as np
from nltk.corpus import brown
from chunking import splitter
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == '__main__':
    # Read the first 10,000 words of the Brown corpus as a single string
    data = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 2000

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)

    for text in text_chunks:
        chunk = {'index': counter, 'text': text}
        chunks.append(chunk)
        counter += 1

    # Extract the document-term matrix
    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    doc_term_matrix = vectorizer.fit_transform([chunk['text'] for chunk in chunks])

    vocab = np.array(vectorizer.get_feature_names())
    print("\nVocabulary:")
    print(vocab)

    print("\nDocument term matrix:")
    chunk_names = ['chunk-0', 'chunk-1', 'chunk-2', 'chunk-3', 'chunk-4']

    formatted_row = '{:>12}' * (len(chunk_names) + 1)
    print('\n', formatted_row.format('Word', *chunk_names), '\n')

    # Each row of the transposed matrix holds the counts of one word across
    # the chunks; min_df=5 keeps only words that occur in all five chunks,
    # so item.data contains exactly one count per chunk.
    for word, item in zip(vocab, doc_term_matrix.T):
        output = [str(x) for x in item.data]
        print(formatted_row.format(word, *output))
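# The listings in this section import `splitter` from a local `chunking`
# module that is not reproduced here. The sketch below is a hypothetical
# stand-in, assuming `splitter` only needs to break a string into pieces of
# roughly `num_words` whitespace-separated words; it is not necessarily the
# original implementation.
def splitter(text, num_words):
    words = text.split(' ')
    output = []

    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    # Keep any trailing words that do not fill a whole chunk
    if cur_words:
        output.append(' '.join(cur_words))

    return output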
import numpy as np
from nltk.corpus import brown
from chunking import splitter

if __name__ == '__main__':
    # Read the data from the Brown corpus
    content = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 2000

    chunks = []
    counter = 0

    text_chunks = splitter(content, num_words)

    for text in text_chunks:
        chunk = {'index': counter, 'text': text}
        chunks.append(chunk)
        counter += 1

    # Extract document term matrix
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    doc_term_matrix = vectorizer.fit_transform(
        [chunk['text'] for chunk in chunks])

    vocab = np.array(vectorizer.get_feature_names())
    print("\nVocabulary:")
    print(vocab)
import numpy as np
from nltk.corpus import brown
from chunking import splitter
import nltk

nltk.download('brown')

# Define the main function and read the input data from the Brown corpus
if __name__ == '__main__':
    content = ' '.join(brown.words()[:10000])

    # Split the text into chunks
    num_of_words = 2000
    num_chunks = []
    count = 0
    texts_chunk = splitter(content, num_of_words)

    # Build a vocabulary based on these text chunks
    for text in texts_chunk:
        num_chunk = {'index': count, 'text': text}
        num_chunks.append(num_chunk)
        count += 1

    # Extract the document-term matrix, which counts the number of
    # occurrences of each word in each chunk
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(min_df=0.5, max_df=0.95)
    matrix = vectorizer.fit_transform(
        [num_chunk['text'] for num_chunk in num_chunks])
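# Note that this variant passes min_df=0.5 where the other listings pass
# min_df=5. In scikit-learn an integer min_df is an absolute number of
# documents, while a float is a proportion of documents, so the two settings
# filter the vocabulary differently. A minimal illustration on toy documents
# (assuming scikit-learn >= 1.0 for get_feature_names_out):
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    'apple banana',
    'apple banana',
    'apple cherry',
    'apple date',
    'apple elderberry',
]

# Integer threshold: keep words appearing in at least 2 of the 5 documents,
# so both 'apple' (5 docs) and 'banana' (2 docs) survive.
cv_count = CountVectorizer(min_df=2)
cv_count.fit(docs)
print(cv_count.get_feature_names_out())

# Float threshold: 0.5 means at least 0.5 * 5 = 2.5 documents, so 'banana'
# (2 docs) is dropped and only 'apple' survives.
cv_fraction = CountVectorizer(min_df=0.5)
cv_fraction.fit(docs)
print(cv_fraction.get_feature_names_out())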
import numpy as np
from nltk.corpus import brown
from chunking import splitter

if __name__ == '__main__':
    # Read the data from the Brown corpus
    data = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 2000

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)

    for text in text_chunks:
        chunk = {'index': counter, 'text': text}
        chunks.append(chunk)
        counter += 1

    # Extract document term matrix
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    doc_term_matrix = vectorizer.fit_transform([chunk['text'] for chunk in chunks])

    vocab = np.array(vectorizer.get_feature_names())
    print("\nVocabulary:")
    print(vocab)
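# The listings above call vectorizer.get_feature_names(), which was removed
# in scikit-learn 1.2. On recent releases the vocabulary is obtained with
# get_feature_names_out() instead; the helper below is a small compatibility
# sketch that works with either API.
import numpy as np

def vocabulary_of(vectorizer):
    if hasattr(vectorizer, 'get_feature_names_out'):
        return np.array(vectorizer.get_feature_names_out())
    return np.array(vectorizer.get_feature_names())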