Example #1
 def __init__(self, corpus, document_titles, num_clusters=None,
              num_features=None):
     """
     Computes the interpreter matrix by calculating the TF-IDF value of each
     token in each concept (doc) in corpus.

     document_titles give the names of each concept (doc) in corpus.
     num_features gives the number of features of corpus.

     If num_clusters is None all documents are used as concepts.
     """
     # Use an identity check so the behavior matches the docstring
     # ("num_clusters == None"); a truthiness test would also silently
     # replace an explicit 0, and it is inconsistent with the
     # `num_features is None` check below.
     if num_clusters is None:
         self.num_clusters = len(document_titles)
     else:
         self.num_clusters = num_clusters

     if num_features is None:
         logger.info("scanning corpus to determine the number of features")
         num_features = 1 + utils.get_max_id(corpus)

     self.num_features = num_features

     # Reduce column count by k-medoid clustering, using the medoid of
     # each cluster as the representative concept.
     # TODO: skip clustering when num_clusters == None
     clusterer = KMedoids(corpus=corpus,
                          num_features=self.num_features,
                          num_clusters=self.num_clusters,
                          max_iterations=10)
     clusters = clusterer.cluster()

     # Set the corpus to the medoids. The corpus is our interpreter
     # matrix; it is not sparse. Each column is a doc and is seen as a
     # concept.
     self.corpus = clusterer.get_medoids().T

     # Reduce document titles and log each cluster with its members in a
     # single pass (the original iterated the clusters dict twice).
     self.document_titles = DocumentTitles()
     for cluster_id, members in clusters.iteritems():
         cluster_title = document_titles[cluster_id]
         self.document_titles.append(cluster_title or "no title")
         member_titles = ", ".join(document_titles[member_id]
                                   for member_id
                                   in members)
         # Lazy %-args: the string is only formatted if DEBUG is enabled.
         logger.debug("%s: %s", cluster_title, member_titles)