Beispiel #1
0
    def plot_scatter(self):
        '''
        Plots all the data points in 2D, labelled by cluster.

        One Orange instance is built per clustered document from that
        document's row in ``self.td_matrix`` and tagged with a "cluster_id"
        meta value. The instances are added to the table in document order
        (the order of ``self.document_dict``), the table is passed to
        ``MDS`` and its ``plot`` method is called.

        The dbscan noise cluster (id -1) is displayed under the id
        ``len(self.clusters) - 1`` because the color scale is wrong for -1.
        The remapping is local to the plot: ``cluster.id`` is left untouched.

        Raises:
            AssertionError: if ``self.attributes`` is empty or
                ``self.td_matrix`` is None.
            ValueError: if a clustered document id is not a key of
                ``self.document_dict``.
        '''
        # Identity check on purpose: "!= None" is evaluated element-wise by
        # array-like matrices and the result cannot be used as a truth value.
        assert self.attributes and self.td_matrix is not None

        # Position of every document in self.document_dict; the same position
        # is used below as the document's row in self.td_matrix.
        doc_rows = {doc_id: row for row, doc_id in enumerate(self.document_dict)}

        meta_col_name = "cluster_id"
        table = construct_orange_table(self.attributes)
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        cluster_count = len(self.clusters)
        classes_list = []
        rows_and_instances = []
        for cluster in self.clusters:
            # Shift the noise id out of the negative range for the color
            # scale without mutating the cluster object itself.
            if cluster.id == -1:
                shown_id = cluster.id + cluster_count
            else:
                shown_id = cluster.id
            classes_list.append(shown_id)

            for doc_id in cluster.document_dict:
                try:
                    row = doc_rows[doc_id]
                except KeyError:
                    # Same exception type the former list.index() lookup raised.
                    raise ValueError("%r is not in list" % (doc_id,))
                inst = Orange.data.Instance(table.domain, list(self.td_matrix[row]))
                inst[meta_col_name] = str(shown_id)
                rows_and_instances.append((row, inst))

        # list.insert(row, ...) on a partially filled list does not yield row
        # order (an insert past the end just appends), so sort explicitly.
        rows_and_instances.sort(key=lambda pair: pair[0])

        # We now have a table with the cluster ids as metas.
        table.extend([inst for _, inst in rows_and_instances])
        mds = MDS(table)
        mds.plot(classes_list=classes_list, class_col_name="cluster_id")
Beispiel #2
0
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.

        The table attributes are the distinct items of an
        ``nltk.TextCollection`` built from the tokens of every document in
        ``self.document_dict``. For each clustered document a temporary
        ``OnlineCluster`` is created and resized to that attribute list, and
        its ``center`` becomes one Orange instance tagged with a
        "cluster_id" meta value. Instances are added to the table in
        document order (the order of ``self.document_dict``) before the
        table is passed to ``MDS`` and plotted.

        Note that ``self.construct_term_doc_matrix`` is called once per
        document while plotting.

        Raises:
            ValueError: if a clustered document id is not a key of
                ``self.document_dict``.
        '''
        # Position of every document in self.document_dict, looked up in O(1)
        # instead of scanning a key list for each document.
        doc_positions = {doc_id: pos for pos, doc_id in enumerate(self.document_dict)}
        corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        table = construct_orange_table(all_terms_vector)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        positioned_instances = []
        for cluster in self.clusters:
            for doc_id, doc_content in cluster.document_dict.items():
                try:
                    index = doc_positions[doc_id]
                except KeyError:
                    # Same exception type the former list.index() lookup raised.
                    raise ValueError("%r is not in list" % (doc_id,))

                # NOTE(review): the original note reads "we use index = 1 to
                # force the function to construct the vector according to all
                # the documents in the collection". The literal 1 is what is
                # passed to OnlineCluster; construct_term_doc_matrix receives
                # the document's real position -- confirm which call is meant.
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                positioned_instances.append((index, inst))

        # list.insert(index, ...) on a partially filled list does not yield
        # index order (an insert past the end just appends), so sort explicitly.
        positioned_instances.sort(key=lambda pair: pair[0])

        # We now have a table with the cluster ids as metas.
        table.extend([inst for _, inst in positioned_instances])
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [c.id for c in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name="cluster_id")
Beispiel #3
0
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.

        Every clustered document is turned into one Orange instance: a
        throwaway ``OnlineCluster`` is built for the document, resized to the
        list of distinct items of the ``nltk.TextCollection`` made from all
        document tokens, and its ``center`` supplies the instance values. Each
        instance carries the owning cluster's id in the "cluster_id" meta
        column. Instances are appended to the table ordered by the
        document's position in ``self.document_dict``; the table is then
        given to ``MDS`` and plotted.

        ``self.construct_term_doc_matrix`` is invoked once per document as
        part of this method.

        Raises:
            ValueError: if a clustered document id is not a key of
                ``self.document_dict``.
        '''
        # Document id -> position in self.document_dict. A dict lookup
        # replaces the per-document linear scan of a key list.
        position_of = {}
        for position, known_id in enumerate(self.document_dict):
            position_of[known_id] = position

        corpus = nltk.TextCollection(
            [document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        table = construct_orange_table(all_terms_vector)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        collected = []
        for cluster in self.clusters:
            for doc_id, doc_content in cluster.document_dict.items():
                if doc_id not in position_of:
                    # Same exception type the former list.index() lookup raised.
                    raise ValueError("%r is not in list" % (doc_id,))
                index = position_of[doc_id]

                # NOTE(review): the original note says "index = 1" forces the
                # vector to be constructed according to all the documents in
                # the collection. Only OnlineCluster receives the literal 1;
                # construct_term_doc_matrix gets the real position -- confirm.
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content,
                                   self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                collected.append((index, inst))

        # Inserting at `index` into a list that is still being filled does not
        # produce index order (out-of-range inserts append), so sort instead.
        collected.sort(key=lambda entry: entry[0])

        # We now have a table with the cluster ids as metas.
        table.extend([entry[1] for entry in collected])
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [c.id for c in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name="cluster_id")