def __init__(self, corpus, document_titles, num_clusters=None, num_features=None):
    """Build the interpreter matrix for *corpus*.

    The interpreter matrix holds the TF-IDF value of each token in each
    concept (document). The documents are reduced to ``num_clusters``
    concepts via k-medoid clustering; each cluster's medoid becomes one
    concept column.

    Parameters
    ----------
    corpus :
        Corpus in bag-of-words format.
    document_titles :
        Names of each concept (document) in `corpus`.
    num_clusters :
        Number of concept clusters. If falsy (``None`` or ``0``), every
        document is used as its own concept.
    num_features :
        Number of features in `corpus`. If ``None``, it is determined by
        scanning the corpus.
    """
    # Falsy num_clusters means "one concept per document".
    if not num_clusters:
        self.num_clusters = len(document_titles)
    else:
        self.num_clusters = num_clusters

    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features

    # Reduce column count by k-medoid clustering, using the medoid of each
    # cluster as the representative concept.
    # TODO: skip clustering when num_clusters is None
    clusterer = KMedoids(corpus=corpus, num_features=self.num_features,
                         num_clusters=self.num_clusters, max_iterations=10)
    clusters = clusterer.cluster()

    # The medoids form our interpreter matrix. It is not sparse; each
    # column is a doc and is seen as a concept.
    self.corpus = clusterer.get_medoids().T

    # Keep only the titles of the medoid documents.
    # NOTE: .keys()/.items() instead of py2-only iterkeys()/iteritems(),
    # so this also runs on Python 3.
    self.document_titles = DocumentTitles()
    for cluster_id in clusters.keys():
        self.document_titles.append(document_titles[cluster_id] or "no title")

    # Log each cluster with its member titles; lazy %-args avoid
    # formatting the string when DEBUG is disabled.
    for cluster_id, members in clusters.items():
        cluster_title = document_titles[cluster_id]
        member_titles = ", ".join(document_titles[member_id] for member_id in members)
        logger.debug("%s: %s", cluster_title, member_titles)