Exemple #1
0
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.

        The table domain is built from the distinct tokens of every document
        in self.document_dict. Each document of each cluster is turned into
        one Orange instance tagged with its cluster id in the "cluster_id"
        meta column, and the filled table is passed to MDS for plotting.

        Raises:
            KeyError: if a cluster holds a document id that is not a key of
                self.document_dict.
        '''
        # Position of every document id within self.document_dict, computed
        # once so the loop below does not run a linear list.index() search
        # for each document.
        doc_positions = dict(
            (doc_id, position)
            for position, doc_id in enumerate(self.document_dict.keys()))

        corpus = nltk.TextCollection(
            [document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        meta_col_name = "cluster_id"
        table = construct_orange_table(all_terms_vector)
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            # .items() rather than .iteritems() so the loop is valid on both
            # Python 2 and Python 3.
            for doc_id, doc_content in cluster.document_dict.items():
                index = doc_positions[doc_id]

                # Original note: "We use index = 1 to force the function to
                # construct the vector according to all the documents in the
                # collection".
                # NOTE(review): the call below passes the document's own
                # index, while the literal 1 goes to OnlineCluster -- confirm
                # which of the two the note is about.
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content,
                                   self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                # list.insert() appends when index is past the current end,
                # so rows are not guaranteed to end up in document order.
                instances.insert(index, inst)

        # The table receives one row per document, with cluster ids as metas.
        table.extend(instances)
        # Function-scope import, kept exactly where the original had it.
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [cluster.id for cluster in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name=meta_col_name)
Exemple #2
0
    def plot_scatter(self):
        '''
        Plots all the data points in 2D.

        Builds an Orange table over self.attributes, adds one instance per
        clustered document (taken from self.td_matrix at that document's
        position in self.document_dict and tagged with its cluster id in the
        "cluster_id" meta column) and passes the table to MDS for plotting.

        Side effect: a cluster whose id is -1 has its id rewritten in place
        to -1 + len(self.clusters).

        Raises:
            AssertionError: if self.attributes is falsy or self.td_matrix is
                None (only while assertions are enabled).
            KeyError: if a cluster holds a document id that is not a key of
                self.document_dict.
        '''
        # Identity test instead of "!= None": on an array-like matrix the
        # "!=" comparison may be evaluated element-wise, and such a result
        # has no single truth value for the assert to check.
        assert self.attributes and self.td_matrix is not None

        # Position of every document id within self.document_dict, computed
        # once so the loop below does not run a linear list.index() search
        # for each document.
        doc_positions = dict(
            (doc_id, position)
            for position, doc_id in enumerate(self.document_dict.keys()))

        meta_col_name = "cluster_id"
        table = construct_orange_table(self.attributes)
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            # A small hack for the dbscan noise cluster: its id is -1 and the
            # color scale is wrong for -1.
            # NOTE(review): this permanently changes cluster.id on the
            # cluster object, not just for this plot.
            if cluster.id == -1:
                cluster.id += len(self.clusters)
            # Iterating the dict yields its keys on both Python 2 and 3.
            for doc_id in cluster.document_dict:
                index = doc_positions[doc_id]
                inst = Orange.data.Instance(table.domain,
                                            list(self.td_matrix[index]))
                inst[meta_col_name] = str(cluster.id)
                # list.insert() appends when index is past the current end,
                # so rows are not guaranteed to end up in document order.
                instances.insert(index, inst)

        # The table receives one row per document, with cluster ids as metas.
        table.extend(instances)
        mds = MDS(table)
        classes_list = [cluster.id for cluster in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name=meta_col_name)
Exemple #3
0
 def save_table(self, filename):
     '''
     Stores the term-document matrix as a tab-delimited file, a format
     supported by Orange.

     The table is built from self.attributes and self.td_matrix, then
     passed through add_metas_to_table together with the keys of
     self.document_dict, and written to filename + ".tab". On success
     self.table_name is set to filename (without the extension).

     Raises:
         Exception: if self.td_matrix is None.
     '''
     # Guard clause with an identity test: "!= None" on an array-like
     # matrix may be evaluated element-wise, and such a result has no
     # single truth value for the "if" to check.
     if self.td_matrix is None:
         raise Exception("Oops. It seems that you have not constructed a term-document matrix. Use construct_term_document_matrix()")

     t = construct_orange_table(self.attributes, self.td_matrix)
     t = add_metas_to_table(t, self.document_dict.keys())
     orange.saveTabDelimited(filename + ".tab", t)
     self.table_name = filename
Exemple #4
0
 def save_table(self, filename):
     '''
     Stores the term-document matrix as a tab-delimited file, a format
     supported by Orange.

     Writes to filename + ".tab" and, on success, records filename
     (without the extension) in self.table_name.

     Raises:
         Exception: if self.td_matrix is None.
     '''
     # "is not None" instead of "!= None": an array-like matrix may
     # evaluate "!=" element-wise, which leaves the "if" without a single
     # truth value to test.
     if self.td_matrix is not None:
         t = construct_orange_table(self.attributes, self.td_matrix)
         # The keys of self.document_dict are handed to
         # add_metas_to_table alongside the freshly built table.
         t = add_metas_to_table(t, self.document_dict.keys())
         orange.saveTabDelimited(filename + ".tab", t)
         self.table_name = filename
     else:
         raise Exception(
             "Oops. It seems that you have not constructed a term-document matrix. Use construct_term_document_matrix()"
         )
Exemple #5
0
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.

        A table is created whose domain comes from the distinct tokens found
        across all documents of self.document_dict. For every document of
        every cluster an Orange instance is built from the center of a
        temporary OnlineCluster and labelled with the cluster id in the
        "cluster_id" meta column. The table is then given to MDS to plot.

        Raises:
            KeyError: if a cluster holds a document id that is not a key of
                self.document_dict.
        '''
        # Build the id -> position lookup once; searching a key list with
        # list.index() for every document would be quadratic overall.
        position_of = dict(
            (doc_id, position)
            for position, doc_id in enumerate(self.document_dict.keys()))

        corpus = nltk.TextCollection(
            [document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        table = construct_orange_table(all_terms_vector)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            # .items() works on Python 2 and 3 alike; .iteritems() exists
            # only on Python 2 dicts.
            for doc_id, doc_content in cluster.document_dict.items():
                index = position_of[doc_id]

                # Original note: "We use index = 1 to force the function to
                # construct the vector according to all the documents in the
                # collection".
                # NOTE(review): the document's own index is what is passed
                # here; the literal 1 is an OnlineCluster argument -- confirm
                # what the note was meant to describe.
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content,
                                   self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                # Inserting at an index beyond the current length appends,
                # so the final row order is not strictly document order.
                instances.insert(index, inst)

        # The table receives one row per document, with cluster ids as metas.
        table.extend(instances)
        # Function-scope import, kept exactly where the original had it.
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [cluster.id for cluster in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name=meta_col_name)