def get_volcabulary_and_list_words(data):
    reviews_words = []
    volcabulary = []
    for review in data["text"]:
        review_words = Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True)
        reviews_words.append(review_words)
        for word in review_words:
            volcabulary.append(word)
    volcabulary = set(volcabulary)
    return volcabulary, reviews_words
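# --- Usage sketch (not part of the original script) ---
# A minimal, hypothetical call on a two-row DataFrame; it assumes Word2VecUtility is
# importable and that review_to_wordlist returns a list of cleaned, lower-cased tokens.
import pandas as pd

sample = pd.DataFrame({"text": ["Great food and friendly service!", "Terrible food."]})
vocab, tokenized = get_volcabulary_and_list_words(sample)
print(len(vocab))     # number of distinct tokens across both reviews
print(tokenized[0])   # token list for the first review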
# print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
# nltk.download()  # Download text data sets, including stop words

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review
print "Cleaning and parsing the training set reviews...\n"
num_reviews = len(train["review"])
for i in xrange(0, num_reviews):
    if (i + 1) % 10000 == 0:
        print "Processing review %d out of %d" % (i + 1, num_reviews)
    clean_train_reviews.append(" ".join(
        Word2VecUtility.review_to_wordlist(train["review"][i], True)))

# ****** Create a bag of words from the training set
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)  # limit to the 5000 most frequent words

# fit_transform() performs two functions: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
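# --- Illustration (not part of the original script) ---
# A small, self-contained sketch of the two steps fit_transform() performs, on toy data;
# the three "cleaned reviews" below are made up for illustration.
from sklearn.feature_extraction.text import CountVectorizer

toy_reviews = ["good movie great acting", "bad movie", "great great film"]
toy_vectorizer = CountVectorizer(analyzer="word", max_features=5000)
toy_features = toy_vectorizer.fit_transform(toy_reviews)   # learns the vocabulary, then encodes
print(toy_features.toarray())       # one row per review, one column per vocabulary word
print(toy_vectorizer.vocabulary_)   # learned mapping of word -> column index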
post_tree = ET.parse(
    '/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()
        if i.attrib.get("PostTypeId") == '2'
        and i.attrib.get("Id") not in aaId
        and i.attrib.get("Id") not in delId]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    tmp = BeautifulSoup(post_body[i].replace('\n', ""), 'html.parser').get_text()
    if tmp == '':
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    f = file('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'w')
    f.write(clean_post.encode('utf-8'))

import textmining
import os

xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'


def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
# test = raw_input()
print ('\n\nPlease uncomment nltk.download() to download text data sets \n')
# nltk.download()  # Download text data sets, including stop words

# Initialize an empty list to hold the clean symptoms and summary
clean_train_symptom = []
clean_train_summary = {}

print ("Cleaning and parsing the training set symptoms...\n")
for i in xrange(0, len(train["symptom"])):
    clean_train_symptom.append(" ".join(
        Word2VecUtility.symptoms_to_wordlist(train["symptom"][i], True)))

print ("Cleaning and parsing the training set summary...\n")
for i in xrange(0, len(train["summary"])):
    clean_train_summary[train["disease"][i]] = "".join(
        Word2VecUtility.summary_to_wordlist(train["summary"][i]))

# ****** Create a bag of words from the training set
#
print ("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
words_set = set(model.index2word)
word2index = {word: (i + index_from) for i, word in enumerate(words_set)}
index2word = {i: word for word, i in list(word2index.items())}
index2word[0] = '0'
index2word[1] = '1'
index2word[2] = '2'

# 'Word2Vec' object does not support item assignment
padding_model = {}
padding_model['0'] = np.random.standard_normal(num_features)
padding_model['1'] = np.random.standard_normal(num_features)
padding_model['2'] = np.random.standard_normal(num_features)

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    # each word index is increased by 3 (index_from).
    review_words = [start] + [word2index[w] if (w in words_set) else oov for w in review_words]
    # review_words = [oov if (ix > (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# pad with 0 so each review has length max_length.
reviews_words = sequence.pad_sequences(reviews_words, maxlen=max_length,
                                       padding='post', truncating='post')
print(reviews_words.shape)


# In[47]:

data_matrix = np.empty((reviews_words.shape[0], max_length, num_features))
print(data_matrix.shape)
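# --- Sketch of the likely next step (not in the original excerpt) ---
# Fill data_matrix with one word vector per time step. This assumes `model` is a trained
# gensim Word2Vec model whose vectors can be read as model[word] (older gensim API), and
# that indices 0/1/2 map to the random padding/start/OOV vectors created above.
for r, review in enumerate(reviews_words):
    for t, ix in enumerate(review):
        word = index2word[ix]
        if word in padding_model:                 # '0', '1', '2' special tokens
            data_matrix[r, t, :] = padding_model[word]
        else:
            data_matrix[r, t, :] = model[word]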
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["Paper_content"]:
        clean_reviews.append(
            Word2VecUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
# In[7]:

# print data.ix[0:10]
print(data.iloc[:10]['text'])
# print data['text'][2]


# In[8]:

review_sents = []
print ("Cleaning and parsing the reviews...\n")
for i in range(0, len(data["text"])):
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)


# In[53]:

out = open('review_sents_1859888.pkl', 'wb')
pickle.dump(review_sents, out)
out.close()


# In[11]:

# review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))
import numpy as np

if __name__ == '__main__':
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)

    print 'The first review is:'
    print train["review"][0]
    raw_input("Press Enter to continue..")

    # Initialize an empty list to hold clean_reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length of the movie review list
    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train["review"])):
        clean_train_reviews.append(" ".join(
            Word2VecUtility.review_to_wordlist(train["review"][i], True)))

    # ****** Create a bag of words from the training set
    print "Creating the bag of words...\n"

    # Initialize the CountVectorizer object, which is scikit-learn's bag of words tool
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform() performs two functions: first, it fits the model and learns the vocabulary;
    # second, it transforms our training data into feature vectors. The input should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Convert to a numpy array
    train_data_features = train_data_features.toarray()
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse(
    '/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    clean_post.append(" ".join(
        Word2VecUtility.review_to_wordlist(post_body[i], True)))

clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(analyzer="word",
                                                              tokenizer=None,
                                                              preprocessor=None,
                                                              stop_words=None,
                                                              max_features=5000, min_df=1)

# fit_transform() performs two functions: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
data_features = vectorizer.fit_transform(clean_post)
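# --- Illustration (not in the original excerpt) ---
# One way to sanity-check the fitted vectorizer: sum data_features column-wise to get
# corpus-wide term counts, then pair them with the learned vocabulary. `vocabulary_`
# maps each term to its column index.
import numpy as np

term_counts = np.asarray(data_features.sum(axis=0)).ravel()
terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
top_terms = sorted(zip(terms, term_counts), key=lambda tc: tc[1], reverse=True)[:20]
print(top_terms)   # the 20 most frequent terms and their counts across all posts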
from time import mktime
import sys
sys.path.insert(0, '/Users/Zhen/Desktop/Courses/BigData/stackexchange/')
from Word2VecUtility import Word2VecUtility
import sklearn
import sklearn.feature_extraction

post_tree = ET.parse('/Users/Zhen/Desktop/Courses/BigData/stackexchange/data/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    clean_post.append(" ".join(Word2VecUtility.review_to_wordlist(post_body[i], True)))

clean_postdf = pd.DataFrame(clean_post)
clean_postdf.to_csv('post_body.csv', sep=',', encoding='utf-8')

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(analyzer="word",
                                                              tokenizer=None,
                                                              preprocessor=None,
                                                              stop_words=None,
                                                              max_features=5000, min_df=1)

# fit_transform() performs two functions: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
word2index = {word: (i + index_from) for i, word in enumerate(words_set)}
index2word = {i: word for word, i in word2index.items()}
index2word[0] = '0'
index2word[1] = '1'
index2word[2] = '2'

# 'Word2Vec' object does not support item assignment
padding_model = {}
padding_model['0'] = np.random.standard_normal(num_features)
padding_model['1'] = np.random.standard_normal(num_features)
padding_model['2'] = np.random.standard_normal(num_features)

data = pd.read_csv('review_sub_399850.tsv', header=0, delimiter="\t",
                   quoting=3, encoding='utf-8')

reviews_words = []
for review in data["text"]:
    review_words = Word2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    # each word index has already been increased by 3 (index_from).
    review_words = [start] + [word2index[w] if (w in model) else oov for w in review_words]
    # keep indices in 0, 1, ..., 5002 (max_words + index_from - 1); anything larger becomes oov
    review_words = [oov if (ix >= (max_words + index_from)) else ix for ix in review_words]
    reviews_words.append(review_words)

# pad with 0 so each review has length max_length.
reviews_words = sequence.pad_sequences(reviews_words, maxlen=max_length,
                                       padding='post', truncating='post')
print reviews_words[:20, :12]
print reviews_words.shape

labels = data["stars"]
# print labels[:10], labels.shape
labels[labels <= 3] = 0
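# --- Illustration (not in the original excerpt) ---
# A tiny, self-contained demonstration of sequence.pad_sequences with padding='post'
# and truncating='post'; the index lists are made up, and the import path assumes the
# older Keras preprocessing module that the surrounding code refers to as `sequence`.
from keras.preprocessing import sequence

demo = [[1, 7, 9], [1, 4, 5, 6, 8, 10, 12]]
print(sequence.pad_sequences(demo, maxlen=5, padding='post', truncating='post'))
# [[ 1  7  9  0  0]   <- short sequence padded with 0 on the right
#  [ 1  4  5  6  8]]  <- long sequence truncated on the right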
# Read data from files
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2', 'labeledTrainData.csv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data2', 'testData.csv'),
                   header=0, delimiter="\t", quoting=3)

print "Cleaning training content"
clean_train_reviews = []
for review in train["Paper_content"]:
    clean_train_reviews.append(
        Word2VecUtility.review_to_wordlist(review, remove_stopwords=True))

print "Cleaning test content"
clean_test_reviews = []
for review in test["Paper_content"]:
    clean_test_reviews.append(
        Word2VecUtility.review_to_wordlist(review, remove_stopwords=True))

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((train["Paper_content"].size, num_clusters),
                           dtype="float32")

# Transform the training set reviews into bags of centroids
counter = 0
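# --- Sketch of a likely continuation (not in the original excerpt) ---
# A "bag of centroids" is usually built by counting, for each review, how many of its
# words fall into each k-means cluster. The helper below assumes a word_centroid_map
# dict of word -> cluster index produced elsewhere; both names are assumptions here.
import numpy as np

def create_bag_of_centroids(wordlist, word_centroid_map, num_clusters):
    bag_of_centroids = np.zeros(num_clusters, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

# e.g. filling the pre-allocated array one review at a time:
# for review in clean_train_reviews:
#     train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map, num_clusters)
#     counter += 1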
# print train["stars"][0]

# print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
# nltk.download()  # Download text data sets, including stop words

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review
print "Cleaning and parsing the training set reviews...\n"
num_reviews = len(train["review"])
for i in xrange(0, num_reviews):
    if (i + 1) % 10000 == 0:
        print "Processing review %d out of %d" % (i + 1, num_reviews)
    clean_train_reviews.append(" ".join(
        Word2VecUtility.review_to_wordlist(train["review"][i], True)))

# ****** Create a bag of words from the training set
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)  # limit to the 5000 most frequent words

# fit_transform() performs two functions: first, it fits the model
delId = np.array(delId).tolist()
delId = [str(int(i)) for i in delId]

post_tree = ET.parse('/Users/XW/Desktop/datascience.stackexchange.com/Posts.xml')
post = [(i.attrib.get("PostTypeId"), i.attrib.get("CreationDate"), i.attrib.get("Body"))
        for i in post_tree.getroot()
        if i.attrib.get("PostTypeId") == '2'
        and i.attrib.get("Id") not in aaId
        and i.attrib.get("Id") not in delId]
post_frame = DataFrame(post, columns=['PostTypeId', 'CreationDate', 'Body'])
post_body = post_frame.loc[:, 'Body']

clean_post = []
print "Cleaning and parsing the posts...\n"
for i in xrange(0, len(post_body)):
    tmp = BeautifulSoup(post_body[i].replace('\n', ""), 'html.parser').get_text()
    if tmp == '':
        continue
    clean_post = " ".join(Word2VecUtility.review_to_wordlist(tmp, True))
    f = file('/Users/XW/Desktop/datascience.stackexchange.com/parse/' + str(i), 'w')
    f.write(clean_post.encode('utf-8'))

import textmining
import os

xDIR = '/Users/XW/Desktop/datascience.stackexchange.com/parse'


def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())
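    # --- Hypothetical continuation (not in the original excerpt) ---
    # textmining's TermDocumentMatrix is typically written out with write_csv();
    # cutoff=1 keeps every term that appears in at least one document. The output
    # filename here is illustrative.
    tdm.write_csv('term_doc_matrix.csv', cutoff=1)
    return tdm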
# Verify the number of reviews that were read (100,000 in total)
print "Read %d labeled train reviews, %d labeled test reviews, " \
      "and %d unlabeled reviews\n" % (train["Paper_content"].size,
                                      test["Paper_content"].size,
                                      unlabeled_train["Paper_content"].size)

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for review in train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

print "Parsing sentences from unlabeled set"
for review in unlabeled_train["Paper_content"]:
    sentences += Word2VecUtility.review_to_sentences(review, tokenizer)

# ****** Set parameters and train the word2vec model
#
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 15   # Minimum word count
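# --- Sketch of the training step that typically follows (not in the original excerpt) ---
# Assumes gensim's word2vec module (older API where the vector dimensionality is passed
# as `size`; newer gensim renamed it to `vector_size`). The remaining parameter values
# below are illustrative, not taken from the original script.
from gensim.models import word2vec

num_workers = 4       # threads to run in parallel (illustrative)
context = 10          # context window size (illustrative)
downsampling = 1e-3   # downsampling of frequent words (illustrative)

model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context,
                          sample=downsampling)
model.init_sims(replace=True)   # freeze the model and free memory (older gensim API)
model.save("word2vec_%dfeatures" % num_features)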
#input("Press Enter to continue...") #print ('Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...') #nltk.download() # Download text data sets, including stop words # Initialize an empty list to hold the clean reviews clean_train_reviews = [] # Loop over each review; create an index i that goes from 0 to the length # of the movie review list print("Cleaning and parsing the training set movie reviews...\n") for i in range(0, len(train["review"])): clean_train_reviews.append(" ".join( Word2VecUtility.review_to_wordlist(train["review"][i], True))) # ****** Create a bag of words from the training set # print("Creating the bag of words...\n") # Initialize the "CountVectorizer" object, which is scikit-learn's # bag of words tool. vectorizer = CountVectorizer(analyzer = "word", \ tokenizer = None, \ preprocessor = None, \ stop_words = None, \ max_features = 5000) # fit_transform() does two functions: First, it fits the model # and learns the vocabulary; second, it transforms our training data
# In[7]:

# print data.ix[0:10]
print data.iloc[:10]['text']
# print data['text'][2]


# In[8]:

review_sents = []
print "Cleaning and parsing the reviews...\n"
for i in xrange(0, len(data["text"])):
    # sent_reviews += Word2VecUtility.review_to_sentences(data["text"][i], tokenizer)
    review_sents += Word2VecUtility.review_to_sentences(data.iloc[i]["text"], tokenizer)


# In[53]:

out = open('review_sents_1859888.pkl', 'wb')
pickle.dump(review_sents, out)
out.close()


# In[11]:

review_sents = pickle.load(open('review_sents_1859888.pkl', 'rb'))
print len(review_sents)
print review_sents[:5]