import math
from collections import Counter

import numpy as np
import nltk
from nltk import FreqDist
from scipy.cluster.vq import kmeans, vq


def train_clusterer(model, commentList):
    # Count how many comment tokens are covered by the word2vec vocabulary.
    count = 0
    index2word_set = set(model.index2word)
    for commList in commentList.values():
        for comm in commList:
            for word in comment_to_wordlist(comm.body, True):
                if word in index2word_set:
                    count += 1
    print count

    # Collect the vector for every in-vocabulary token, duplicates included.
    # Assumes a 300-dimensional word2vec model.
    vocab_words = []
    word_vectors = np.zeros((count, 300))
    i = 0
    for commList in commentList.values():
        for comm in commList:
            for word in comment_to_wordlist(comm.body, True):
                if word in index2word_set:
                    word_vectors[i] = model[word]
                    vocab_words.append(word)
                    i += 1

    # Roughly 20 words per cluster.
    num_clusters = word_vectors.shape[0] / 20
    print word_vectors.shape

    # Initialize a k-means object and use it to extract centroids
    #idx = KMeans(n_clusters=num_clusters).fit_predict(word_vectors)
    centroids, _ = kmeans(word_vectors, num_clusters)
    idx, _ = vq(word_vectors, centroids)
    print "TRAINED"

    # Pair each clustered word with its centroid index. Zipping against
    # model.index2word would misalign words and cluster ids, since the rows
    # of word_vectors follow the comment corpus, not the model vocabulary.
    word_centroid_map = dict(zip(vocab_words, idx))

    # For the first 10 clusters, print the cluster number and its words.
    for cluster in xrange(0, 10):
        print "\nCluster %d" % cluster
        words = [word for word, c in word_centroid_map.items() if c == cluster]
        print words

    return word_centroid_map, num_clusters
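# comment_to_wordlist is called throughout this file but defined elsewhere.
# Below is a minimal sketch of what it presumably does, following the common
# Kaggle word2vec-tutorial preprocessing; the regex, the stopword source, and
# the default argument are assumptions, not taken from the original code.
import re
from nltk.corpus import stopwords

def comment_to_wordlist(comment, remove_stops=False):
    # Keep letters only, lowercase, and split on whitespace.
    words = re.sub("[^a-zA-Z]", " ", comment).lower().split()
    if remove_stops:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words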
def tfidf_weighted_sum_features(model, df_comments):
    M = model.syn0.shape[1]
    N = df_comments.shape[0]
    feature_matrix = np.empty((N, M), dtype="float32")
    index = 0
    index2word_set = set(model.index2word)

    # Create a global bag of words over all comments; its corpus-wide term
    # counts serve as the document-frequency term in the IDF below.
    global_body = []
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        global_body += comment_to_wordlist(comm, True)
    global_freq_dist = nltk.FreqDist(global_body)

    IN_COUNT = 1
    OUT_COUNT = 1
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        vec_sum = np.zeros((M,), dtype="float32")  # avoid shadowing the builtin sum
        clean_comm = comment_to_wordlist(comm, True)
        fdist = nltk.FreqDist(clean_comm)
        for word in clean_comm:
            # Augmented term frequency, normalized by the most frequent term.
            tf = 0.5 + 0.5 * float(fdist[word]) / np.max(fdist.values())
            count = global_freq_dist[word]
            idf = math.log(N / float(1 + count))
            if word in index2word_set:
                #print "IN - ", word
                IN_COUNT += 1
                # TF-IDF weighted vector sum
                vec_sum += tf * idf * model[word]
            else:
                #print "OUT - ", word
                OUT_COUNT += 1
        feature_matrix[index] = vec_sum
        index += 1
    print IN_COUNT, 'words found'
    print OUT_COUNT, 'words not found'
    return feature_matrix
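# A minimal usage sketch for tfidf_weighted_sum_features. The model path and
# CSV file below are assumptions for illustration; the function itself only
# requires a DataFrame with a 'comment_content' column and a trained model.
def _demo_tfidf_features():
    import pandas as pd
    from gensim.models import Word2Vec

    model = Word2Vec.load("comments_word2vec.model")  # hypothetical path
    df_comments = pd.read_csv("comments.csv")         # hypothetical file
    features = tfidf_weighted_sum_features(model, df_comments)
    # One row per comment: the TF-IDF weighted sum of its word vectors.
    print features.shape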
def bag_of_centroids_features(commentList, commentCount, num_clusters, centroid_map):
    # Pre-allocate an array for the bags of centroids (for speed), using the
    # commentCount parameter that was previously passed in but never used.
    feature_matrix = np.zeros((commentCount, num_clusters), dtype="float32")

    # Transform each comment into a bag of centroids.
    index = 0
    for commList in commentList.values():
        for comm in commList:
            feature_matrix[index] = create_bag_of_centroids(
                comment_to_wordlist(comm.body, True), centroid_map)
            index += 1
    return feature_matrix
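# create_bag_of_centroids is used above but not defined in this file. The
# version below is a sketch consistent with how it is called here (it matches
# the helper of the same name in the Kaggle word2vec tutorial); treat the
# exact behavior as an assumption.
def create_bag_of_centroids(wordlist, word_centroid_map):
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    # Count how many of the comment's words fall into each cluster.
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids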
def train_clusterer(model, commentList):
    # Collect the in-vocabulary tokens and build a cleaned corpus.
    count = 0
    corpus = []
    words = []
    index2word_set = set(model.index2word)
    for commList in commentList.values():
        for comm in commList:
            tokens = []
            for word in comment_to_wordlist(comm.body, True):
                if word in index2word_set:
                    tokens.append(word)
                    words.append(word)
                    count += 1
            corpus.append(' '.join(tokens))

    # Drop the 10 most common words, then keep one entry per unique word.
    fdist1 = FreqDist(words)
    most_common = [item[0] for item in fdist1.most_common(10)]
    uniques = Counter(words).keys()
    print fdist1
    print len(most_common)
    print len(uniques)
    wordlist = [item for item in uniques if item not in most_common]
    N = len(wordlist)
    print N

    # Assumes a 200-dimensional word2vec model.
    word_vectors = np.zeros((N, 200))
    for i, word in enumerate(wordlist):
        word_vectors[i] = model[word]

    # Roughly 10 words per cluster.
    num_clusters = word_vectors.shape[0] / 10
    print word_vectors.shape
    print num_clusters

    # Initialize a k-means object and use it to extract centroids
    #idx = KMeans(n_clusters=num_clusters).fit_predict(word_vectors)
    centroids, _ = kmeans(word_vectors, num_clusters)
    idx, _ = vq(word_vectors, centroids)
    print "TRAINED"

    # Zip against wordlist, not model.index2word: the rows of word_vectors
    # follow wordlist, so pairing with the model vocabulary would misalign
    # words and cluster ids.
    word_centroid_map = dict(zip(wordlist, idx))

    # For the first 10 clusters, print the cluster number and its words.
    for cluster in xrange(0, 10):
        print "\nCluster %d" % cluster
        cluster_words = [word for word, c in word_centroid_map.items() if c == cluster]
        print cluster_words

    return word_centroid_map, num_clusters
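# End-to-end usage sketch for the clustering pipeline above. commentList is
# assumed to be a dict mapping keys to lists of comment objects with a .body
# attribute, as the loops above imply; the model path and load_comments
# loader are hypothetical placeholders.
def _demo_bag_of_centroids():
    from gensim.models import Word2Vec

    model = Word2Vec.load("comments_word2vec.model")  # hypothetical path
    commentList = load_comments()                     # hypothetical loader
    word_centroid_map, num_clusters = train_clusterer(model, commentList)
    commentCount = sum(len(c) for c in commentList.values())
    features = bag_of_centroids_features(commentList, commentCount,
                                         num_clusters, word_centroid_map)
    print features.shape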
def __iter__(self):
    # Stream one tokenized comment per line; the file is pipe-delimited and
    # the comment body is the second field.
    for line in open(self.file):
        body = line.split("|")[1]
        yield comment_to_wordlist(body, remove_stops=True)
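# Usage sketch: __iter__ above presumably sits on a corpus-streaming class
# whose constructor sets self.file. The class name CommentCorpus and the
# file name are assumptions for illustration. gensim's Word2Vec accepts any
# restartable iterable of token lists, which is exactly what __iter__ yields.
def _demo_train_word2vec():
    from gensim.models import Word2Vec

    corpus = CommentCorpus("comments.txt")  # hypothetical class and file
    # size=200 matches the 200-dimensional vectors assumed by the second
    # train_clusterer above (old pre-1.0 gensim keyword).
    model = Word2Vec(corpus, size=200, window=5, min_count=5, workers=4)
    model.save("comments_word2vec.model")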