def construct_term_doc_matrix(self, pca=False):
    '''
    Constructs a term-document matrix such that td_matrix[document][term]
    contains the tf-idf weighting score for the term in the document.

    If pca is True, the matrix is projected onto its principal components
    for dimensionality reduction; self.attributes then becomes a list of
    column indices, since term names have no meaning after the projection.
    '''
    if not self.filter_terms:
        corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
    else:
        corpus = nltk.TextCollection(self._filter_terms())

    terms = list(set(corpus))
    # Map each term to its column once: list.index() inside the inner loop
    # below would make the fill O(terms * words) instead of O(words).
    term_column = dict((term, col) for col, term in enumerate(terms))
    data_rows = numpy.zeros([len(self.document_dict), len(terms)])

    for i, document in enumerate(self.document_dict.values()):
        text = nltk.Text(document.tokens)
        for item in document.word_frequencies:
            data_rows[i][term_column[item.word]] = corpus.tf_idf(item.word, text)

    self.attributes = terms
    self.td_matrix = data_rows

    # If PCA is requested we project our points on their principal
    # components for dimensionality reduction.
    if pca:
        t = construct_orange_table(self.attributes, self.td_matrix)
        self.td_matrix = orange_pca(t)
        # Attribute names have no meaning after dimensionality reduction.
        self.attributes = [i for i in range(self.td_matrix.shape[1])]
def plot_scatter(self):
    '''
    Overrides the parent class method. Plots all the data points in 2D.
    '''
    # Ordered list of document ids so a doc's collection index can be
    # recovered later (horrible hack I know).
    clusterer_document_list = [key for key in self.document_dict.keys()]

    corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
    all_terms_vector = list(set(corpus))

    meta_col_name = "cluster_id"
    table = construct_orange_table(all_terms_vector)
    table = add_metas_to_table(table, meta_col_name=meta_col_name)

    instances = []
    for cluster in self.clusters:
        for doc_id, doc_content in cluster.document_dict.iteritems():
            index = clusterer_document_list.index(doc_id)
            # Force the vector to be constructed against every document in
            # the collection, not just this cluster's.
            self.construct_term_doc_matrix(index, doc_content)
            oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes)
            oc.resize(all_terms_vector)
            inst = Orange.data.Instance(table.domain, list(oc.center))
            inst[meta_col_name] = str(cluster.id)
            instances.insert(index, inst)

    # The table now carries the cluster ids as meta attributes.
    table.extend(instances)

    from visualizations.mds import MDS
    mds = MDS(table)
    classes_list = [c.id for c in self.clusters]
    mds.plot(classes_list=classes_list, class_col_name="cluster_id")
def train(self, train_set, attributes):
    '''
    Gets the training data (numpy array) and the attribute list and
    constructs the decision tree.

    The trained classifier is stored on self.classifer and the Orange
    table built from the training data on self.train_table.
    '''
    self.train_table = construct_orange_table(attributes, train_set, classed=True)
    treeLearner = orngTree.TreeLearner()
    # NOTE(review): the attribute name 'classifer' (sic) is preserved --
    # other code may read it, so renaming would break callers. The
    # original also bound an unused local 'treeClassifier'; dropped.
    self.classifer = treeLearner(self.train_table)
def plot_scatter(self):
    '''
    Plots all the data points in 2D.

    Requires that construct_term_doc_matrix() has already populated
    self.attributes and self.td_matrix.
    '''
    # 'td_matrix != None' on a numpy array yields an element-wise boolean
    # array, and truth-testing that array raises ValueError -- identity
    # comparison with None is the correct check.
    assert self.attributes and self.td_matrix is not None

    # Ordered list of document ids so a doc's row index can be recovered
    # (horrible hack I know).
    clusterer_document_list = [key for key in self.document_dict.keys()]

    table = construct_orange_table(self.attributes)
    meta_col_name = "cluster_id"
    table = add_metas_to_table(table, meta_col_name=meta_col_name)

    instances = []
    for cluster in self.clusters:
        # A small hack for the dbscan noise cluster: its id is -1 and the
        # color scale is wrong for negative ids, so remap it.
        if cluster.id == -1:
            cluster.id += len(self.clusters)
        for doc_id in cluster.document_dict.iterkeys():
            index = clusterer_document_list.index(doc_id)
            inst = Orange.data.Instance(table.domain, list(self.td_matrix[index]))
            inst[meta_col_name] = str(cluster.id)
            instances.insert(index, inst)

    # The table now carries the cluster ids as meta attributes.
    table.extend(instances)

    mds = MDS(table)
    classes_list = []
    for c in self.clusters:
        classes_list.append(c.id)
    mds.plot(classes_list=classes_list, class_col_name="cluster_id")
def save_table(self, filename):
    '''
    Stores the term-document matrix as a tab delimited file which is
    supported by Orange.

    Raises an Exception when the matrix has not been constructed yet.
    '''
    # 'td_matrix != None' on a numpy array returns an element-wise boolean
    # array whose truth value is ambiguous -- use identity comparison.
    if self.td_matrix is not None:
        t = construct_orange_table(self.attributes, self.td_matrix)
        t = add_metas_to_table(t, self.document_dict.keys())
        orange.saveTabDelimited(filename + ".tab", t)
        self.table_name = filename
    else:
        raise Exception("Oops. It seems that you have not constructed a term-document matrix. Use construct_term_document_matrix()")
def save_table(self, filename):
    '''
    Stores the term-document matrix as a tab delimited file which is
    supported by Orange.

    Raises an Exception when the matrix has not been constructed yet.
    '''
    # 'td_matrix != None' on a numpy array returns an element-wise boolean
    # array whose truth value is ambiguous -- use identity comparison.
    if self.td_matrix is not None:
        t = construct_orange_table(self.attributes, self.td_matrix)
        t = add_metas_to_table(t, self.document_dict.keys())
        orange.saveTabDelimited(filename + ".tab", t)
        self.table_name = filename
    else:
        raise Exception(
            "Oops. It seems that you have not constructed a term-document matrix. Use construct_term_document_matrix()"
        )
def test_author_classification_dummy_dataset(self):
    '''
    Trains an Orange decision tree on a small hand-built author feature
    matrix (last column is the class) and checks that an unseen example
    resembling class 0 is classified as 0.
    '''
    train_set = numpy.array([[0.2, 0.5, 0.2, 0.2, 0.1, 10., 0],
                             [0.2, 0.3, 0.12, 0.1, 0.1, 10., 0],
                             [0.2, 0.2, 0.08, 0.2, 0.01, 20., 0],
                             [0.2, 0.5, 0.1, 0.1, 0.2, 5., 0],
                             [0.2, 0.1, 0.2, 0.2, 0.3, 20., 0],
                             [0.7, 0.5, 0.2, 0.8, 0.3, 0.1, 1],
                             [0.6, 0.8, 5.2, 0.2, 0.6, 0.3, 1],
                             [0.2, 0.6, 8.2, 0.9, 0.9, 0.1, 1],
                             [0.5, 0.9, 1.2, 0.1, 0.1, 0.2, 1],
                             [0.9, 0.1, 0.9, 0.6, 0.3, 0.6, 1]])
    attributes = ["retweets", "links", "retweeted", "replies", "mentions", "ff-ratio", "class"]
    table = construct_orange_table(attributes, train_set, classed=True)
    treeLearner = orngTree.TreeLearner()
    treeClassifier = treeLearner(table)
    example = Orange.data.Instance(table.domain, [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0])
    prediction = treeClassifier(example)
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(0, prediction.value)
def test_author_classification_dummy_dataset(self):
    '''
    Trains an Orange decision tree on a small hand-built author feature
    matrix (last column is the class) and checks that an unseen example
    resembling class 0 is classified as 0.
    '''
    train_set = numpy.array(
        [
            [0.2, 0.5, 0.2, 0.2, 0.1, 10.0, 0],
            [0.2, 0.3, 0.12, 0.1, 0.1, 10.0, 0],
            [0.2, 0.2, 0.08, 0.2, 0.01, 20.0, 0],
            [0.2, 0.5, 0.1, 0.1, 0.2, 5.0, 0],
            [0.2, 0.1, 0.2, 0.2, 0.3, 20.0, 0],
            [0.7, 0.5, 0.2, 0.8, 0.3, 0.1, 1],
            [0.6, 0.8, 5.2, 0.2, 0.6, 0.3, 1],
            [0.2, 0.6, 8.2, 0.9, 0.9, 0.1, 1],
            [0.5, 0.9, 1.2, 0.1, 0.1, 0.2, 1],
            [0.9, 0.1, 0.9, 0.6, 0.3, 0.6, 1],
        ]
    )
    attributes = ["retweets", "links", "retweeted", "replies", "mentions", "ff-ratio", "class"]
    table = construct_orange_table(attributes, train_set, classed=True)
    treeLearner = orngTree.TreeLearner()
    treeClassifier = treeLearner(table)
    example = Orange.data.Instance(table.domain, [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0])
    prediction = treeClassifier(example)
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(0, prediction.value)
def plot_scatter(self): ''' Overrides the parent class method. Plots all the data points in 2D. ''' #Create a clusterer document list to get the index of a doc (horrible hack I know) clusterer_document_list = [key for key in self.document_dict.keys()] corpus = nltk.TextCollection( [document.tokens for document in self.document_dict.values()]) all_terms_vector = list(set(corpus)) table = construct_orange_table(all_terms_vector) meta_col_name = "cluster_id" table = add_metas_to_table(table, meta_col_name=meta_col_name) instances = [] for cluster in self.clusters: for doc_id, doc_content in cluster.document_dict.iteritems(): index = clusterer_document_list.index(doc_id) #We use index = 1 to force the function to construct the vector according to all the documents in the collection self.construct_term_doc_matrix(index, doc_content) oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes) oc.resize(all_terms_vector) inst = Orange.data.Instance(table.domain, list(oc.center)) inst[meta_col_name] = str(cluster.id) instances.insert(index, inst) #we have a table with the clusters ids as metas. table.extend(instances) from visualizations.mds import MDS mds = MDS(table) classes_list = [] for c in self.clusters: classes_list.append(c.id) mds.plot(classes_list=classes_list, class_col_name="cluster_id")