def plot_scatter(self):
    '''
    Overrides the parent class method. Plots all the data points in 2D.

    Builds an Orange table with one attribute per distinct token of the
    whole collection plus a "cluster_id" meta column, adds one instance per
    clustered document (in collection order) and hands the table to MDS.

    Side effects: self.construct_term_doc_matrix() is called once for every
    clustered document, and self.td_matrix is read right after each call.

    Raises:
        KeyError: if a cluster references a document id that is not in
            self.document_dict.
    '''
    # Position of every document id in the collection. A dict replaces the
    # repeated list.index() scan, which was quadratic in the document count.
    doc_positions = {
        doc_id: position
        for position, doc_id in enumerate(self.document_dict)
    }

    corpus = nltk.TextCollection(
        [document.tokens for document in self.document_dict.values()])
    all_terms_vector = list(set(corpus))

    table = construct_orange_table(all_terms_vector)
    meta_col_name = "cluster_id"
    table = add_metas_to_table(table, meta_col_name=meta_col_name)

    indexed_instances = []
    for cluster in self.clusters:
        # items() behaves the same as iteritems() here and also exists on
        # Python 3 dicts.
        for doc_id, doc_content in cluster.document_dict.items():
            index = doc_positions[doc_id]
            # NOTE(review): the original comment claimed "index = 1" is used
            # to force the vector to be built against all the documents in
            # the collection, but the document position is what is passed.
            # Confirm which one construct_term_doc_matrix() expects.
            self.construct_term_doc_matrix(index, doc_content)
            oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content,
                               self.attributes)
            oc.resize(all_terms_vector)
            inst = Orange.data.Instance(table.domain, list(oc.center))
            inst[meta_col_name] = str(cluster.id)
            indexed_instances.append((index, inst))

    # list.insert(index, ...) on a partially filled list does not yield
    # document order (indices arriving as 2, 1, 0 end up as 0, 2, 1), so the
    # rows are sorted by document position once, after the loop.
    indexed_instances.sort(key=lambda pair: pair[0])

    # We now have a table with the cluster ids as metas.
    table.extend([inst for _, inst in indexed_instances])

    from visualizations.mds import MDS
    mds = MDS(table)
    classes_list = [c.id for c in self.clusters]
    mds.plot(classes_list=classes_list, class_col_name=meta_col_name)
def plot_scatter(self):
    '''
    Plots all the data points in 2D.

    Builds an Orange table from self.attributes with a "cluster_id" meta
    column, adds one row of self.td_matrix per clustered document (in
    collection order) and hands the table to MDS for plotting.

    Raises:
        AssertionError: if self.attributes is empty or self.td_matrix is
            None.
        KeyError: if a cluster references a document id that is not in
            self.document_dict.
    '''
    # Explicit raise instead of `assert`: the check survives `python -O`, and
    # `is None` avoids the element-wise comparison that `!= None` triggers on
    # array-like matrices.
    if not self.attributes or self.td_matrix is None:
        raise AssertionError(
            "plot_scatter() needs attributes and a term-document matrix")

    # Position of every document id in the collection. A dict replaces the
    # repeated list.index() scan, which was quadratic in the document count.
    doc_positions = {
        doc_id: position
        for position, doc_id in enumerate(self.document_dict)
    }

    table = construct_orange_table(self.attributes)
    meta_col_name = "cluster_id"
    table = add_metas_to_table(table, meta_col_name=meta_col_name)

    # A small hack for the dbscan noise cluster: its id is -1 and the color
    # scale is wrong for -1, so it is plotted as len(self.clusters) - 1.
    # The remapped id is used for plotting only; cluster.id is left untouched
    # so the noise cluster can still be recognised after plotting.
    noise_display_id = len(self.clusters) - 1
    display_ids = [
        noise_display_id if cluster.id == -1 else cluster.id
        for cluster in self.clusters
    ]

    indexed_instances = []
    for cluster, display_id in zip(self.clusters, display_ids):
        for doc_id in cluster.document_dict:
            index = doc_positions[doc_id]
            inst = Orange.data.Instance(table.domain,
                                        list(self.td_matrix[index]))
            inst[meta_col_name] = str(display_id)
            indexed_instances.append((index, inst))

    # list.insert(index, ...) on a partially filled list does not yield
    # document order (indices arriving as 2, 1, 0 end up as 0, 2, 1), so the
    # rows are sorted by document position once, after the loop.
    indexed_instances.sort(key=lambda pair: pair[0])

    # We now have a table with the cluster ids as metas.
    table.extend([inst for _, inst in indexed_instances])

    mds = MDS(table)
    mds.plot(classes_list=display_ids, class_col_name=meta_col_name)
def save_table(self, filename):
    '''
    Stores the term-document matrix as a tab delimited file, which is
    supported by Orange.

    filename: target path without extension; ".tab" is appended when
        saving. The bare filename is remembered in self.table_name.

    Raises:
        Exception: if self.td_matrix is None, i.e. no term-document matrix
            has been constructed yet.
    '''
    # `is None` rather than `!= None`: on an array-like matrix `!=` compares
    # element-wise, which makes the truth test itself fail.
    if self.td_matrix is None:
        raise Exception(
            "Oops. It seems that you have not constructed a "
            "term-document matrix. Use construct_term_document_matrix()")

    t = construct_orange_table(self.attributes, self.td_matrix)
    # The document ids are passed to add_metas_to_table() alongside the table.
    t = add_metas_to_table(t, self.document_dict.keys())
    orange.saveTabDelimited(filename + ".tab", t)
    self.table_name = filename
def save_table(self, filename):
    '''
    Stores the term-document matrix as a tab delimited file, which is
    supported by Orange.

    filename: target path without extension; ".tab" is appended when
        saving. The bare filename is remembered in self.table_name.

    Raises:
        Exception: if self.td_matrix is None, i.e. no term-document matrix
            has been constructed yet.
    '''
    # Identity check instead of `!= None`: `!=` on an array-like matrix is
    # evaluated element-wise and cannot be used as a plain condition.
    if self.td_matrix is None:
        raise Exception(
            "Oops. It seems that you have not constructed a term-document matrix. Use construct_term_document_matrix()"
        )

    t = construct_orange_table(self.attributes, self.td_matrix)
    # The document ids are passed to add_metas_to_table() alongside the table.
    t = add_metas_to_table(t, self.document_dict.keys())
    orange.saveTabDelimited(filename + ".tab", t)
    self.table_name = filename
def plot_scatter(self):
    '''
    Overrides the parent class method. Plots all the data points in 2D.

    The table handed to MDS has one attribute per distinct token found in
    the collection and a "cluster_id" meta column; every clustered document
    contributes one instance, and instances are added in collection order.

    Side effects: self.construct_term_doc_matrix() is called once for every
    clustered document, and self.td_matrix is read right after each call.

    Raises:
        KeyError: if a cluster references a document id that is not in
            self.document_dict.
    '''
    # Map each document id to its position in the collection once, instead
    # of scanning a key list with list.index() for every document.
    doc_positions = {
        doc_id: position
        for position, doc_id in enumerate(self.document_dict)
    }

    corpus = nltk.TextCollection(
        [document.tokens for document in self.document_dict.values()])
    all_terms_vector = list(set(corpus))

    table = construct_orange_table(all_terms_vector)
    meta_col_name = "cluster_id"
    table = add_metas_to_table(table, meta_col_name=meta_col_name)

    indexed_instances = []
    for cluster in self.clusters:
        # items() behaves the same as iteritems() here and also exists on
        # Python 3 dicts.
        for doc_id, doc_content in cluster.document_dict.items():
            index = doc_positions[doc_id]
            # NOTE(review): the original comment claimed "index = 1" is used
            # to force the vector to be built against all the documents in
            # the collection, but the document position is what is passed.
            # Confirm which one construct_term_doc_matrix() expects.
            self.construct_term_doc_matrix(index, doc_content)
            oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content,
                               self.attributes)
            oc.resize(all_terms_vector)
            inst = Orange.data.Instance(table.domain, list(oc.center))
            inst[meta_col_name] = str(cluster.id)
            indexed_instances.append((index, inst))

    # Inserting at `index` into a partially filled list does not produce
    # document order (indices arriving as 2, 1, 0 end up as 0, 2, 1), so the
    # rows are ordered explicitly before they are added to the table.
    indexed_instances.sort(key=lambda pair: pair[0])

    # We now have a table with the cluster ids as metas.
    table.extend([inst for _, inst in indexed_instances])

    from visualizations.mds import MDS
    mds = MDS(table)
    classes_list = [c.id for c in self.clusters]
    mds.plot(classes_list=classes_list, class_col_name=meta_col_name)