Example #1
    def construct_term_doc_matrix(self, pca=False):
        '''
        Constructs a term-document matrix such that td_matrix[document][term]
        contains the TF-IDF weighting score for the term in the document.
        '''
        if not self.filter_terms:
            corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
        else:
            corpus = nltk.TextCollection(self._filter_terms())

        terms = list(set(corpus))
        data_rows = numpy.zeros([len(self.document_dict), len(terms)])

        for i, document in enumerate(self.document_dict.values()):
            text = nltk.Text(document.tokens)
            for item in document.word_frequencies:
                data_rows[i][terms.index(item.word)] = corpus.tf_idf(item.word, text)

        self.attributes = terms
        self.td_matrix = data_rows

        # If pca is True, project the points onto their principal components
        # for dimensionality reduction.
        if pca:
            t = construct_orange_table(self.attributes, self.td_matrix)
            self.td_matrix = orange_pca(t)
            # Attribute names have no meaning after dimensionality reduction.
            self.attributes = [i for i in range(self.td_matrix.shape[1])]
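The core of this example is the TF-IDF weighting done through nltk.TextCollection. Below is a minimal, self-contained sketch of that step alone, using only nltk and numpy; the two token lists are made up for illustration and stand in for the document.tokens used above.

import nltk
import numpy

# Two tiny made-up "documents", represented as token lists.
doc_tokens = [["apple", "banana", "apple"], ["banana", "cherry"]]

corpus = nltk.TextCollection([nltk.Text(tokens) for tokens in doc_tokens])
terms = list(set(corpus))

# td_matrix[i][j] holds the TF-IDF score of terms[j] in document i.
td_matrix = numpy.zeros([len(doc_tokens), len(terms)])
for i, tokens in enumerate(doc_tokens):
    text = nltk.Text(tokens)
    for word in set(tokens):
        td_matrix[i][terms.index(word)] = corpus.tf_idf(word, text)

print(terms)
print(td_matrix)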
Example #2
    def plot_scatter(self):
        '''
        Overrides the parent class method. Plots all the data points in 2D.
        '''
        # Create a list of document ids so we can look up a doc's index (a hack, admittedly).
        clusterer_document_list = list(self.document_dict.keys())
        corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
        all_terms_vector = list(set(corpus))
        table = construct_orange_table(all_terms_vector)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            for doc_id, doc_content in cluster.document_dict.iteritems():
                index = clusterer_document_list.index(doc_id)
                
                # Pass the document's index to force the function to construct the vector against all the documents in the collection.
                self.construct_term_doc_matrix(index, doc_content)
                oc = OnlineCluster(self.td_matrix, 1, doc_id, doc_content, self.attributes)
                oc.resize(all_terms_vector)
                inst = Orange.data.Instance(table.domain, list(oc.center))
                inst[meta_col_name] = str(cluster.id)
                instances.insert(index, inst)

        # We now have a table with the cluster ids stored as metas.
        table.extend(instances)
        from visualizations.mds import MDS
        mds = MDS(table)
        classes_list = [c.id for c in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name="cluster_id")
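What the method above boils down to: one vector per document, tagged with its cluster id, projected to 2D for plotting. The sketch below illustrates that idea in isolation, with scikit-learn's MDS and matplotlib standing in for the project's visualizations.mds wrapper, and a random matrix plus made-up cluster labels in place of the real term-document data.

import numpy
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# Made-up term-document matrix (5 documents, 4 terms) and cluster labels.
td_matrix = numpy.random.rand(5, 4)
cluster_ids = [0, 0, 1, 1, 2]

# Project the document vectors down to two dimensions.
points = MDS(n_components=2).fit_transform(td_matrix)

plt.scatter(points[:, 0], points[:, 1], c=cluster_ids)
plt.show()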
Example #3
    def train(self, train_set, attributes):
        '''
        Takes the training data (a numpy array) and the attribute list and constructs the decision tree.
        '''
        self.train_table = construct_orange_table(attributes, train_set, classed=True)
        treeLearner = orngTree.TreeLearner()
        self.classifier = treeLearner(self.train_table)
Example #4
    def plot_scatter(self):
        '''
        Plots all the data points in 2D.
        '''
        assert self.attributes and self.td_matrix is not None
        
        # Create a list of document ids so we can look up a doc's index (a hack, admittedly).
        clusterer_document_list = list(self.document_dict.keys())
        table = construct_orange_table(self.attributes)
        meta_col_name = "cluster_id"
        table = add_metas_to_table(table, meta_col_name=meta_col_name)

        instances = []
        for cluster in self.clusters:
            # A small hack for the DBSCAN noise cluster: its id is -1 and the color scale breaks for negative ids.
            if cluster.id == -1:
                cluster.id += len(self.clusters)
            for doc_id in cluster.document_dict.iterkeys():
                index = clusterer_document_list.index(doc_id)
                inst = Orange.data.Instance(table.domain, list(self.td_matrix[index]))
                inst[meta_col_name] = str(cluster.id)
                instances.insert(index, inst)

        # We now have a table with the cluster ids stored as metas.
        table.extend(instances)        
        mds = MDS(table)
        classes_list = [c.id for c in self.clusters]

        mds.plot(classes_list=classes_list, class_col_name="cluster_id")
Example #5
    def save_table(self, filename):
        '''
        Stores the term-document matrix as a tab-delimited file,
        a format supported by Orange.
        '''
        if self.td_matrix is not None:
            t = construct_orange_table(self.attributes, self.td_matrix)
            t = add_metas_to_table(t, self.document_dict.keys())
            orange.saveTabDelimited(filename + ".tab", t)
            self.table_name = filename
        else:
            raise Exception("No term-document matrix has been constructed yet. Call construct_term_doc_matrix() first.")
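A note on the is not None guard used above: when td_matrix is a numpy array, comparing it to None with != is either a deprecated scalar comparison or an elementwise one, depending on the numpy version, so that check is fragile; testing identity is the safe idiom. A tiny illustration (behaviour here follows recent numpy releases):

import numpy

td_matrix = numpy.zeros([2, 3])

# Elementwise comparison: yields a boolean array, not a single True/False.
print(td_matrix != None)

# Identity check: the reliable way to test whether the matrix exists.
print(td_matrix is not None)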
Example #7
    def test_author_classification_dummy_dataset(self):
        # Dummy training set: the last column holds the class label.
        train_set = numpy.array([[0.2, 0.5, 0.2,  0.2, 0.1,  10.,  0],
                                 [0.2, 0.3, 0.12, 0.1, 0.1,  10.,  0],
                                 [0.2, 0.2, 0.08, 0.2, 0.01, 20.,  0],
                                 [0.2, 0.5, 0.1,  0.1, 0.2,  5.,   0],
                                 [0.2, 0.1, 0.2,  0.2, 0.3,  20.,  0],
                                 [0.7, 0.5, 0.2,  0.8, 0.3,  0.1,  1],
                                 [0.6, 0.8, 5.2,  0.2, 0.6,  0.3,  1],
                                 [0.2, 0.6, 8.2,  0.9, 0.9,  0.1,  1],
                                 [0.5, 0.9, 1.2,  0.1, 0.1,  0.2,  1],
                                 [0.9, 0.1, 0.9,  0.6, 0.3,  0.6,  1]])

        attributes = ["retweets", "links", "retweeted", "replies", "mentions", "ff-ratio", "class"]

        table = construct_orange_table(attributes, train_set, classed=True)
        treeLearner = orngTree.TreeLearner()
        treeClassifier = treeLearner(table)
        example = Orange.data.Instance(table.domain, [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0])
        prediction = treeClassifier(example)
        self.assertEqual(0, prediction.value)
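To judge how well such a tree generalises beyond a single hand-picked example, one might cross-validate it. The sketch below is an assumption about the environment: it relies on the classic Orange 2.x evaluation modules orngTest and orngStat (matching the orngTree import used above), and Orange's bundled iris dataset stands in for the dummy table built in the test.

import Orange, orngTree, orngTest, orngStat

# Orange ships a small sample dataset; any Orange table with a class variable works here.
table = Orange.data.Table("iris")

learners = [orngTree.TreeLearner()]
results = orngTest.crossValidation(learners, table, folds=5)

# Classification accuracy of the tree learner, averaged over the folds.
print(orngStat.CA(results))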