def test_tokenization(self):
        """Check that TextAnalyser.add_document reproduces the reference output.

        Every sample document is passed to a fresh TextAnalyser; the return
        values are collected in a dict keyed by the document's position
        (as a string) and compared with the expected mapping.
        """
        # The third value returned by get_test_documents() is not needed here.
        expected, sample_docs, _ = get_test_documents()
        analyser = TextAnalyser()

        calculated = {}
        # enumerate() replaces the hand-maintained counter, which also
        # shadowed the builtin ``id``.
        for position, raw_doc in enumerate(sample_docs):
            calculated[str(position)] = analyser.add_document(raw_doc)

        self.assertEqual(expected, calculated)
 def test_tokenization(self):
     """Check that TextAnalyser.add_document reproduces the reference output.

     Every sample document is passed to a fresh TextAnalyser; the return
     values are collected in a dict keyed by the document's position
     (as a string) and compared with the expected mapping.
     """
     # The third value returned by get_test_documents() is not needed here.
     expected, sample_docs, _ = get_test_documents()
     analyser = TextAnalyser()

     calculated = {}
     # enumerate() replaces the hand-maintained counter, which also
     # shadowed the builtin ``id``.
     for position, raw_doc in enumerate(sample_docs):
         calculated[str(position)] = analyser.add_document(raw_doc)

     self.assertEqual(expected, calculated)
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.nmf import NMFClusterer
from tests.test_document import get_test_documents

# Module-level fixtures: everything below runs once, at import time.
expected, sample_docs_raw, samples = get_test_documents()

# Clusterer instance used by the test case in this module; it is loaded with
# the third value returned by get_test_documents().
nmfc = NMFClusterer(ngram=1)
nmfc.add_documents(samples)


class Test(unittest.TestCase):
    """Smoke test for the module-level ``nmfc`` clusterer."""

    def test_nmf_cluster(self):
        """Call ``nmfc.run`` with fixed settings, then dump the clusters.

        No assertions are made; the test passes when neither call raises.
        """
        run_options = dict(
            seed='random_vcol',
            method='nmf',
            rank=2,
            max_iter=65,
            display_N_tokens=6,
            display_N_documents=3,
        )
        nmfc.run(**run_options)
        nmfc.dump_clusters_to_file("nmf_with_samples")


# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents
###########################################
# GLOBALS                                #
###########################################
# Built once at import time. Only the third value returned by
# get_test_documents() is used; the first two are bound to ``ignore``.
ignore, ignore, samples  =  get_test_documents()

# Clusterer instance used by the tests in this module, fed one sample at a time.
oc = OrangeKmeansClusterer(k=2)        
for sample in samples:
    oc.add_document(sample)

class Test(unittest.TestCase):
    """Tests that exercise the module-level ``oc`` clusterer."""

    def test_orange_cluster_term_document_matrix(self):
        """Compare ``oc.td_matrix`` element-wise with the reference matrix."""
        oc.construct_term_doc_matrix()
        calculated = oc.td_matrix
        expected = numpy.array([[ 0.31388923,  0.11584717,  0,           0,           0,           0,           0.47083384],
                                [ 0,           0.13515504,  0.3662041,   0,           0.3662041,   0,           0         ],
                                [ 0,           0,           0,           0.54930614,  0,           0.549306140, 0         ]])

        # ``expected.all() == calculated.all()`` only compared two booleans
        # ("is every entry non-zero?"), so any matrix containing a zero
        # passed. Check the shape explicitly (allclose broadcasts), then
        # compare the values with a floating-point tolerance.
        self.assertEqual(expected.shape, numpy.shape(calculated))
        self.assertTrue(numpy.allclose(expected, calculated))

    def test_orange_save_matrix_to_tab_file(self):
        """Rebuild the matrix and call ``oc.save_table``.

        No assertions are made; the test passes when neither call raises.
        """
        oc.construct_term_doc_matrix()
        oc.save_table("sample_table_orange")
# Example #5
# 0
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents
###########################################
# GLOBALS                                #
###########################################
# Built once at import time. Only the third value returned by
# get_test_documents() is used; the first two are bound to ``ignore``.
ignore, ignore, samples = get_test_documents()

# Clusterer instance used by the tests in this module, fed one sample at a time.
oc = OrangeKmeansClusterer(k=2)
for sample in samples:
    oc.add_document(sample)


class Test(unittest.TestCase):
    """Tests that exercise the module-level ``oc`` clusterer."""

    def test_orange_cluster_term_document_matrix(self):
        """Compare ``oc.td_matrix`` element-wise with the reference matrix."""
        oc.construct_term_doc_matrix()
        calculated = oc.td_matrix
        expected = numpy.array(
            [[0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
             [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
             [0, 0, 0, 0.54930614, 0, 0.549306140, 0]])

        # ``expected.all() == calculated.all()`` only compared two booleans
        # ("is every entry non-zero?"), so any matrix containing a zero
        # passed. Check the shape explicitly (allclose broadcasts), then
        # compare the values with a floating-point tolerance.
        self.assertEqual(expected.shape, numpy.shape(calculated))
        self.assertTrue(numpy.allclose(expected, calculated))

    def test_orange_save_matrix_to_tab_file(self):
        """Rebuild the term-document matrix; passes when the call does not raise."""
        oc.construct_term_doc_matrix()
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.online import OnlineClusterer
from analysis.clustering.structures import OnlineCluster
from tests.test_document import get_test_documents, get_orange_clustering_test_data
###########################################
# GLOBALS                                #
###########################################
# Fetched once at import time and shared by the tests in this module.
expected, sample_docs_raw, samples = get_test_documents()

class Test(unittest.TestCase):

    def test_sample_doc_clustering_with_online(self):
        """Feed the orange clustering test data through an OnlineClusterer.

        Each document is added and then clustered; afterwards every cluster's
        ``document_dict`` is printed. No assertions are made, so the test
        passes whenever none of the calls raises.
        """
        oc = OnlineClusterer(N=2, window=3)
        for document in get_orange_clustering_test_data():
            # The return value of add_document was never used, so it is no
            # longer bound to a local.
            oc.add_document(document)
            oc.cluster(document)

        # NOTE(review): the original defined ``expected = [0, 0, 0, 1, 1, 1]``
        # but never compared it with anything. Presumably these are the
        # intended cluster assignments -- TODO confirm and add an assertion.
        for cluster in oc.clusters:
            # Parenthesised form is valid on Python 2 and Python 3 alike; the
            # bare ``print`` statement is a SyntaxError on Python 3.
            print(cluster.document_dict)
        
    def test_cluster_term_document_matrix(self):
        oc = OnlineClusterer(N=2, window=3)        
        for document in samples:
            index = oc.add_document(document)
            oc.cluster(document)