def keyWordsCluster_KMeansTFIDF(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth): """ Uses `createKeyWords` to form clusters of words post TF IDF (optional). Args: log_directory (str): Location of the logs (Assumed to be arranged in directory structure as : <year>/<month>/<day>/<log-file-for-channel>.txt) channel_name (str): Channel to be perform analysis on output_directory (str): Location of output directory startingDate (int): Date to start the analysis (in conjunction with startingMonth) startingMonth (int): Date to start the analysis (in conjunction with startingDate) endingDate (int): Date to end the analysis (in conjunction with endingMonth) endingMonth (int): Date to end the analysis (in conjunction with endingDate) Returns: null """ do_SVD = False words_to_show_per_cluster = 10 elbow_method_for_finding_K = False '''NON ELBOW''' number_of_clusters = 11 #elbow for jan-2013 = '''ELBOW SETTINGS''' check_k_till = 20 ''' MANUALLY CREATING A MATRIX ''' # each user's normalised frequency stored in rows # all the keywords (unfiltered) # ''' # keyword_list = [] # user_list = [] # keyword_dict_list, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) # for dictionary in user_keyword_freq_dict: # # print dictionary['keywords'] # keyword_list = list(set(keyword_list + [x[0] for x in dictionary['keywords']])) # user_list.append(dictionary['nick']) # # print "\n \n \n", "KEYWORDS_LIST", keyword_list # # print "\n \n \n", "USER_LIST", user_list # #GENERATE A MATRIX WITH USERS AS ROWS AND KEYWORDS AS COLUMNS # user_keyword_matrix = np.zeros(shape=(len(user_list), len(keyword_list))) # # user_keyword_matrix = [[0]*len(keyword_list) for _ in xrange(len(user_list))] # for dictionary in user_keyword_freq_dict: # # print dictionary['nick'], user_list.index(dictionary['nick']) # for word_tuple in 
dictionary['keywords']: # # print word_tuple, keyword_list.index(word_tuple[0]) # user_keyword_matrix[user_list.index(dictionary['nick'])][keyword_list.index(word_tuple[0])] += word_tuple[1] # print user_keyword_matrix # transformer = TfidfTransformer() # tfidf = transformer.fit_transform(user_keyword_matrix) # tfIDFMatrix = tfidf.toarray() # print np.nonzero(tfIDFMatrix) # # Each row is normalized to have unit euclidean norm. # # The weights of each feature computed by the fit method call are stored in a model attribute: # print "Weights of each feature", transformer.idf_ # for i in xrange(len(transformer.idf_)): # print keyword_list[i], transformer.idf_[i] # # ''' AUTO TFIDF FROM JUST SENTENCES ''' #http://scikit-learn.org/stable/auto_examples/text/document_clustering.html #BUILDING CORPUS keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords( log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) corpus = [] for user_words_dict in user_words_dict_list: # print "SENDER", user_words_dict['sender'] # print "WORDS", " ".join(user_words_dict['words']) corpus.append(" ".join(map(str, user_words_dict['words']))) print "No. 
of users", len(corpus) #TF_IDF stop_word_without_apostrophe = [] for words in common_english_words.words: stop_word_without_apostrophe.append(words.replace("'", "")) stop_words_extended = text.ENGLISH_STOP_WORDS.union( common_english_words.words).union(nicks_for_stop_words).union( stop_word_without_apostrophe).union(custom_stop_words.words).union( custom_stop_words.slangs) vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True) print "Extracting features from the training dataset using TF-IDF" t0 = time() tf_idf = vectorizer.fit_transform(corpus) print("done in %fs" % (time() - t0)) print "n_samples: %d, n_features: %d \n" % tf_idf.shape # LSA if do_SVD: print("============USING SVD==========") print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. svd = TruncatedSVD(100) #recommened value = 100 normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) tf_idf = lsa.fit_transform(tf_idf) print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) if not elbow_method_for_finding_K: # CLUSTERING km = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=3465, max_iter=100, n_init=8) print("Clustering sparse data with %s" % km) t0 = time() km.fit(tf_idf) print("done in %0.3fs" % (time() - t0)) print("Top terms per cluster:") if do_SVD: original_space_centroids = svd.inverse_transform( km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] np.set_printoptions(threshold=np.nan) terms = vectorizer.get_feature_names() for i in range(number_of_clusters): print("Cluster %d:" % i) for ind in 
order_centroids[i, :words_to_show_per_cluster]: print terms[ind] + "\t" + str( round(km.cluster_centers_[i][ind], 2)) print "" else: print "============ELBOW METHOD =============" sum_squared_errors_list = [] avg_sum_squared_errors_list = [] for i in xrange(1, check_k_till + 1): print "\n===>> K = ", i km = KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=8) t0 = time() km.fit(tf_idf) if do_SVD: original_space_centroids = svd.inverse_transform( km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] distance_matrix_all_combination = cdist(tf_idf, km.cluster_centers_, 'euclidean') # cIdx = np.argmin(distance_matrix_all_combination,axis=1) distance_from_nearest_centroid = np.min( distance_matrix_all_combination, axis=1) sum_squared_errors = sum(distance_from_nearest_centroid) avg_sum_squared_errors = sum_squared_errors / tf_idf.shape[0] print "Sum Squared Error =", sum_squared_errors print "Avg Sum Squared Error =", avg_sum_squared_errors sum_squared_errors_list.append(sum_squared_errors) avg_sum_squared_errors_list.append(avg_sum_squared_errors) print("Top terms per cluster:") terms = vectorizer.get_feature_names() for i in range(i): print("Cluster %d:" % i) for ind in order_centroids[i, :words_to_show_per_cluster]: print(' %s' % terms[ind]) print() plt.plot(range(1, check_k_till + 1), sum_squared_errors_list, 'b*-') # ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, # markeredgewidth=2, markeredgecolor='r', markerfacecolor='None') plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Average sum of squares') plt.title('Elbow for KMeans clustering') plt.savefig(output_directory + 'key-words/' + 'elbow_KMeans.png') plt.show() #NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION print "NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION"
import createKeyWords as CKW log_directory = "/home/rohan/parser_files/2013/" channel_name= "#kubuntu-devel" #channel name output_directory = "/home/rohan/parser_files/Output/" startingDate = 1 startingMonth = 1 endingDate = 4 endingMonth = 2 user_list = [] keyword_list = [] keyword_dict_list, user_keyword_freq_dict = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) for dictionary in keyword_dict_list: user_list.append(dictionary['nick']) keyword_list = list(set(keyword_list + dictionary['keywords'])) keyword_user_binary_matrix = [[0 for i in xrange(len(user_list))] for x in xrange(len(keyword_list))] # print user_list # print keyword_list # print keyword_user_binary_matrix for user in user_list: key_words_for_users = filter(lambda keywords_user: keywords_user['nick'] == user, keyword_dict_list)[0]['keywords'] for word in key_words_for_users: keyword_user_binary_matrix[keyword_list.index(word)][user_list.index(user)] = 1 print user_list, "\n"
def fuzzyCMeans(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth): """[Deprecated] Fuzzy C Means clustering on key-words instead of KMeans """ do_SVD = True words_to_show_per_cluster = 20 number_of_clusters = 8 keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) corpus = [] for user_words_dict in user_words_dict_list: # print "SENDER", user_words_dict['sender'] # print "WORDS", " ".join(user_words_dict['words']) corpus.append(" ".join(map(str,user_words_dict['words']))) print "No. of users", len(corpus) #TF_IDF stop_word_without_apostrophe=[] for words in common_english_words.words: stop_word_without_apostrophe.append(words.replace("'","")) stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs) vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True) print "Extracting features from the training dataset using TF-IDF" t0 = time() tf_idf = vectorizer.fit_transform(corpus) print("done in %fs" % (time() - t0)) print "n_samples: %d, n_features: %d \n" % tf_idf.shape # LSA if do_SVD: print("============USING SVD==========") print("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. 
svd = TruncatedSVD(100) #recommened value = 100 normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) tf_idf = lsa.fit_transform(tf_idf) print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) np.set_printoptions(threshold=np.inf) #clusters tf_idf_transpose = tf_idf.T #c-means takes the transpose centroids, U, U0, d, Jm, p, fpc = fuzz.cluster.cmeans( tf_idf_transpose, number_of_clusters, 2., error=0.005, maxiter=1000, init=None) print "CENTROIDS", centroids if do_SVD: original_space_centroids = svd.inverse_transform(centroids) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = centroids.argsort()[:, ::-1] print "original_space_centroids", original_space_centroids print "order_centroids", order_centroids terms = vectorizer.get_feature_names() for i in range(number_of_clusters): print("Cluster %d:" % i) for ind in order_centroids[i, :words_to_show_per_cluster]: print(' %s' % terms[ind]) print()
def svdOnKeywords(log_directory, channel_name, output_directory, startingDate,
                  startingMonth, endingDate, endingMonth):
    """[Deprecated] Uses createKeyWords and then tries to form clusters by
    extracting more meaningful keywords. Performs a Singular Value
    Decomposition (SVD) after doing a Term Frequency-Inverse Document
    Frequency (TF-IDF).

    Args:
        log_directory (str): Location of the logs
        channel_name (str): Channel to perform analysis on
        output_directory (str): Location of output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Month to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Month to end the analysis (in conjunction with endingDate)

    Returns:
        null
    """
    keyword_list = []
    user_list = []
    keyword_dict_list, user_keyword_freq_dict = CKW.createKeyWords(
        log_directory, channel_name, output_directory, startingDate,
        startingMonth, endingDate, endingMonth)

    # Union of every user's keywords (keyword tuples have the format
    # [<word>, <frequency>, <normalised_score>]).
    for dictionary in user_keyword_freq_dict:
        keyword_list = list(
            set(keyword_list + [x[0] for x in dictionary['keywords']]))

    user_keyword_normalfreq_matrix = []
    user_keyword_freq_matrix_for_doc_ = []
    keyword_for_user = []
    for user_tuple in user_keyword_freq_dict:
        nick = user_tuple['nick']
        keywords = user_tuple['keywords']
        user_list.append(nick)

        # N = euclidean norm of this user's keyword-frequency vector,
        # i.e. (summation of ni**2)**1/2
        temp = 0
        for keyword in keywords:
            temp += keyword[1]**2
        N = math.sqrt(temp)

        temp = []
        keyword_normal_freq_for_user = [0 for i in range(len(keyword_list))]  # column
        for keyword_tuple in keywords:
            keyword = keyword_tuple[0]
            normal_freq = keyword_tuple[1] / N
            keyword_normal_freq_for_user[keyword_list.index(keyword)] = normal_freq
            # Repeat the word <frequency> times so the TF step below can
            # simply count occurrences.
            for i in range(0, keyword_tuple[1]):
                temp.append(keyword)
        keyword_for_user.append(temp)
        user_keyword_normalfreq_matrix.append(keyword_normal_freq_for_user)

    # TF-IDF
    # https://stanford.edu/~rjweiss/public_html/IRiSS2013/text2/notebooks/tfidf.html
    mydoclist = keyword_for_user
    vocabulary = keyword_list
    doc_term_matrix = []

    def l2_normalizer(vec):
        # Scale vec to unit L2 norm.
        denom = numpy.sum([el**2 for el in vec])
        return [(el / math.sqrt(denom)) for el in vec]

    def freq(term, document):
        return document.count(term)

    def tf(term, document):
        return freq(term, document)

    for doc in mydoclist:
        print('The doc is "' + ",".join(doc) + '"')
        tf_vector = [tf(word, doc) for word in vocabulary]
        tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
        print('The tf vector for Document %d is [%s]' %
              ((mydoclist.index(doc) + 1), tf_vector_string))
        doc_term_matrix.append(tf_vector)

    def numDocsContaining(word, doclist):
        doccount = 0
        for doc in doclist:
            if freq(word, doc) > 0:
                doccount += 1
        return doccount

    def idf(word, doclist):
        n_samples = len(doclist)
        df = numDocsContaining(word, doclist)
        # BUG FIX: the original computed numpy.log(n_samples / 1 + df),
        # which by operator precedence is log(n_samples + df). The intended
        # IDF is log(n_samples / (1 + df)), with float division so Python 2
        # does not truncate.
        return numpy.log(float(n_samples) / (1 + df))

    my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
    # print 'Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']'
    # print 'The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']'

    def build_idf_matrix(idf_vector):
        # Diagonal BxB matrix so tf * idf reduces to one dot product.
        idf_mat = numpy.zeros((len(idf_vector), len(idf_vector)))
        numpy.fill_diagonal(idf_mat, idf_vector)
        return idf_mat

    my_idf_matrix = build_idf_matrix(my_idf_vector)
    print("idf-matrix", my_idf_matrix)

    # performing tf-idf matrix multiplication
    doc_term_matrix_tfidf = []
    for tf_vector in doc_term_matrix:
        doc_term_matrix_tfidf.append(numpy.dot(tf_vector, my_idf_matrix))

    # normalizing each document so words that appear too frequently within
    # a document do not dominate (L2 norm = 1)
    doc_term_matrix_tfidf_l2 = []
    for tf_vector in doc_term_matrix_tfidf:
        doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
    print(vocabulary)
    print(doc_term_matrix_tfidf_l2)

    # SVD (cluster code borrowed from
    # http://www.nltk.org/_modules/nltk/cluster/util.html)
    svd_dimensions = 5
    # vectors = user_keyword_normalfreq_matrix
    # vectors = doc_term_matrix_tfidf_l2
    vectors = doc_term_matrix_tfidf
    if svd_dimensions and svd_dimensions < len(vectors[0]):
        [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
        S = d[:svd_dimensions] * numpy.identity(svd_dimensions, numpy.float64)
        T = u[:, :svd_dimensions]
        Dt = vt[:svd_dimensions, :]
        vectors = numpy.transpose(numpy.dot(S, Dt))
        print("S", S)
        print("T", T)
        print("Dt", Dt)
def keyWordsCluster_KMeansTFIDF( log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth ): do_SVD = False words_to_show_per_cluster = 10 elbow_method_for_finding_K = False """NON ELBOW""" number_of_clusters = 11 # elbow for jan-2013 = """ELBOW SETTINGS""" check_k_till = 20 """ MANUALLY CREATING A MATRIX """ # each user's normalised frequency stored in rows # all the keywords (unfiltered) # ''' # keyword_list = [] # user_list = [] # keyword_dict_list, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) # for dictionary in user_keyword_freq_dict: # # print dictionary['keywords'] # keyword_list = list(set(keyword_list + [x[0] for x in dictionary['keywords']])) # user_list.append(dictionary['nick']) # # print "\n \n \n", "KEYWORDS_LIST", keyword_list # # print "\n \n \n", "USER_LIST", user_list # #GENERATE A MATRIX WITH USERS AS ROWS AND KEYWORDS AS COLUMNS # user_keyword_matrix = np.zeros(shape=(len(user_list), len(keyword_list))) # # user_keyword_matrix = [[0]*len(keyword_list) for _ in xrange(len(user_list))] # for dictionary in user_keyword_freq_dict: # # print dictionary['nick'], user_list.index(dictionary['nick']) # for word_tuple in dictionary['keywords']: # # print word_tuple, keyword_list.index(word_tuple[0]) # user_keyword_matrix[user_list.index(dictionary['nick'])][keyword_list.index(word_tuple[0])] += word_tuple[1] # print user_keyword_matrix # transformer = TfidfTransformer() # tfidf = transformer.fit_transform(user_keyword_matrix) # tfIDFMatrix = tfidf.toarray() # print np.nonzero(tfIDFMatrix) # # Each row is normalized to have unit euclidean norm. 
# # The weights of each feature computed by the fit method call are stored in a model attribute: # print "Weights of each feature", transformer.idf_ # for i in xrange(len(transformer.idf_)): # print keyword_list[i], transformer.idf_[i] # # """ AUTO TFIDF FROM JUST SENTENCES """ # http://scikit-learn.org/stable/auto_examples/text/document_clustering.html # BUILDING CORPUS keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords( log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth ) corpus = [] for user_words_dict in user_words_dict_list: # print "SENDER", user_words_dict['sender'] # print "WORDS", " ".join(user_words_dict['words']) corpus.append(" ".join(map(str, user_words_dict["words"]))) print "No. of users", len(corpus) # TF_IDF stop_word_without_apostrophe = [] for words in common_english_words.words: stop_word_without_apostrophe.append(words.replace("'", "")) stop_words_extended = ( text.ENGLISH_STOP_WORDS.union(common_english_words.words) .union(nicks_for_stop_words) .union(stop_word_without_apostrophe) .union(custom_stop_words.words) .union(custom_stop_words.slangs) ) vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True) print "Extracting features from the training dataset using TF-IDF" t0 = time() tf_idf = vectorizer.fit_transform(corpus) print ("done in %fs" % (time() - t0)) print "n_samples: %d, n_features: %d \n" % tf_idf.shape # LSA if do_SVD: print ("============USING SVD==========") print ("Performing dimensionality reduction using LSA") t0 = time() # Vectorizer results are normalized, which makes KMeans behave as # spherical k-means for better results. Since LSA/SVD results are # not normalized, we have to redo the normalization. 
svd = TruncatedSVD(100) # recommened value = 100 normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) tf_idf = lsa.fit_transform(tf_idf) print ("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print ("Explained variance of the SVD step: {}%".format(int(explained_variance * 100))) if not elbow_method_for_finding_K: # CLUSTERING km = KMeans(n_clusters=number_of_clusters, init="k-means++", random_state=3465, max_iter=100, n_init=8) print ("Clustering sparse data with %s" % km) t0 = time() km.fit(tf_idf) print ("done in %0.3fs" % (time() - t0)) print ("Top terms per cluster:") if do_SVD: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] np.set_printoptions(threshold=np.nan) terms = vectorizer.get_feature_names() for i in range(number_of_clusters): print ("Cluster %d:" % i) for ind in order_centroids[i, :words_to_show_per_cluster]: print terms[ind] + "\t" + str(round(km.cluster_centers_[i][ind], 2)) print "" else: print "============ELBOW METHOD =============" sum_squared_errors_list = [] avg_sum_squared_errors_list = [] for i in xrange(1, check_k_till + 1): print "\n===>> K = ", i km = KMeans(n_clusters=i, init="k-means++", max_iter=100, n_init=8) t0 = time() km.fit(tf_idf) if do_SVD: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] distance_matrix_all_combination = cdist(tf_idf, km.cluster_centers_, "euclidean") # cIdx = np.argmin(distance_matrix_all_combination,axis=1) distance_from_nearest_centroid = np.min(distance_matrix_all_combination, axis=1) sum_squared_errors = sum(distance_from_nearest_centroid) avg_sum_squared_errors = sum_squared_errors / tf_idf.shape[0] print "Sum Squared Error =", sum_squared_errors print "Avg 
Sum Squared Error =", avg_sum_squared_errors sum_squared_errors_list.append(sum_squared_errors) avg_sum_squared_errors_list.append(avg_sum_squared_errors) print ("Top terms per cluster:") terms = vectorizer.get_feature_names() for i in range(i): print ("Cluster %d:" % i) for ind in order_centroids[i, :words_to_show_per_cluster]: print (" %s" % terms[ind]) print () plt.plot(range(1, check_k_till + 1), sum_squared_errors_list, "b*-") # ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, # markeredgewidth=2, markeredgecolor='r', markerfacecolor='None') plt.grid(True) plt.xlabel("Number of clusters") plt.ylabel("Average sum of squares") plt.title("Elbow for KMeans clustering") plt.savefig(output_directory + "key-words/" + "elbow_KMeans.png") plt.show() # NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION print "NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION"
def svdOnKeywords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth): """[Deprecated] uses createKeyWords function and then tries to form clusters by extracting more meaningful keywords. Performs a Singular Value Decomposition(SVD) after doing a Term Frequency–Inverse Document Frequency(TF-IDF).tered) """ keyword_list = [] user_list = [] keyword_dict_list, user_keyword_freq_dict = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth) for dictionary in user_keyword_freq_dict: # print dictionary['keywords'] keyword_list = list(set(keyword_list + [x[0] for x in dictionary['keywords']])) # print user_keyword_freq_dict #(Format : [<word>, <frequency>, <normalised_score>])' user_keyword_normalfreq_matrix = [] user_keyword_freq_matrix_for_doc_ = [] keyword_for_user = [] for user_tuple in user_keyword_freq_dict: nick = user_tuple['nick'] keywords = user_tuple['keywords'] user_list.append(nick) N = 0 temp = 0 '''calculete N = (summation of ni**2)**1/2''' for keyword in keywords: temp += keyword[1]**2 N = math.sqrt(temp) temp = [] keyword_normal_freq_for_user = [0 for i in xrange(len(keyword_list))] #to be used as column for keyword_tuple in keywords: keyword = keyword_tuple[0] normal_freq = keyword_tuple[1]/N keyword_normal_freq_for_user[keyword_list.index(keyword)] = normal_freq for i in xrange(0,keyword_tuple[1]): temp.append(keyword) keyword_for_user.append(temp) user_keyword_normalfreq_matrix.append(keyword_normal_freq_for_user) # print len(user_list) # print len(keyword_list) # print keyword_for_user # print user_keyword_normalfreq_matrix # print len(user_keyword_normalfreq_matrix ) ''' IF-IDF https://stanford.edu/~rjweiss/public_html/IRiSS2013/text2/notebooks/tfidf.html ''' mydoclist = keyword_for_user vocabulary = keyword_list doc_term_matrix = [] def l2_normalizer(vec): denom = numpy.sum([el**2 for el in vec]) return [(el / math.sqrt(denom)) for el in 
vec] def tf(term, document): return freq(term, document) def freq(term, document): return document.count(term) for doc in mydoclist: print 'The doc is "' + ",".join(doc)+ '"' tf_vector = [tf(word, doc) for word in vocabulary] tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector) print 'The tf vector for Document %d is [%s]' % ((mydoclist.index(doc)+1), tf_vector_string) doc_term_matrix.append(tf_vector) def numDocsContaining(word, doclist): doccount = 0 for doc in doclist: if freq(word, doc) > 0: doccount +=1 return doccount def idf(word, doclist): n_samples = len(doclist) df = numDocsContaining(word, doclist) return numpy.log(n_samples / 1+df) my_idf_vector = [idf(word, mydoclist) for word in vocabulary] # print 'Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']' # print 'The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']' def build_idf_matrix(idf_vector): idf_mat = numpy.zeros((len(idf_vector), len(idf_vector))) numpy.fill_diagonal(idf_mat, idf_vector) return idf_mat my_idf_matrix = build_idf_matrix(my_idf_vector) print "idf-matrix" , my_idf_matrix # Now we have converted our IDF vector into a matrix of size BxB, where the diagonal is the IDF vector. That means we can perform now multiply every term frequency vector by the inverse document frequency matrix. Then to make sure we are also accounting for words that appear too frequently within documents, we'll normalize each document such that the L2 norm = 1. 
doc_term_matrix_tfidf = [] #performing tf-idf matrix multiplication for tf_vector in doc_term_matrix: doc_term_matrix_tfidf.append(numpy.dot(tf_vector, my_idf_matrix)) #normalizing doc_term_matrix_tfidf_l2 = [] for tf_vector in doc_term_matrix_tfidf: doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector)) print vocabulary print doc_term_matrix_tfidf_l2# np.matrix() just to make it easier to look at ''' SVD ''' # clusterer = nltk.cluster.util.VectorSpaceClusterer(normalise=False, svd_dimensions=25)#http://www.nltk.org/_modules/nltk/cluster/util.html # clusterer.cluster(user_keyword_normalfreq_matrix) #borrow cluster code from http://www.nltk.org/_modules/nltk/cluster/util.html svd_dimensions = 5 # vectors = user_keyword_normalfreq_matrix # vectors = doc_term_matrix_tfidf_l2 vectors = doc_term_matrix_tfidf if svd_dimensions and svd_dimensions < len(vectors[0]): [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors))) S = d[:svd_dimensions] * \ numpy.identity(svd_dimensions, numpy.float64) T = u[:,:svd_dimensions] Dt = vt[:svd_dimensions,:] vectors = numpy.transpose(numpy.dot(S, Dt)) print "S", S print "T", T print "Dt", Dt