def test_orange_with_tweets_kmeans(self):
    """Run k-means (k=34, unigrams, PCA off) on warehouse documents.

    Fetches documents between 26 Jan 2011 and 27 Jan 2011 (``limit=1000``)
    through the module-level ``ws``, clusters them, prints the elapsed
    wall-clock seconds, then plots the non-cumulative growth timeline and
    the scatter plot and dumps the clusters to
    ``kmeans_with_tweets_orange``.

    There are no assertions: the test only fails if one of the calls raises.
    """
    import time
    start = time.time()
    from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
    to_date = datetime.datetime(2011, 1, 27, 0, 0, 0)
    items = ws.get_documents_by_date(from_date, to_date, limit=1000)

    # Local clusterer: assigning ``oc`` here makes it function-local, so a
    # module-level ``oc`` is not modified by this test.
    oc = OrangeKmeansClusterer(k=34, ngram=1)
    oc.add_documents(items)
    oc.run("orange_clustering_test", pca=False)

    # Parenthesised so the line parses on Python 3 as well; with a single
    # argument Python 2 prints exactly what the bare statement printed.
    print(time.time() - start)

    oc.plot_growth_timeline(cumulative=False)
    oc.plot_scatter()
    oc.dump_clusters_to_file("kmeans_with_tweets_orange")
@author: george

Unit tests for the analysis.clustering package.
'''
import datetime, unittest 
from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_orange_clustering_test_data

###########################################
# GLOBALS                                #
###########################################
# Both objects are created once, when this module is imported.
ws = WarehouseServer()
sample_docs = get_orange_clustering_test_data()

# Module-level clusterer (k=2) that receives every sample document, one
# add_document() call per document; TestOrangeClustering runs it.
oc = OrangeKmeansClusterer(k=2)
for s in sample_docs:
    oc.add_document(s)

class TestOrangeClustering(unittest.TestCase):
    """K-means tests that run against the module-level ``oc`` clusterer."""

    ###########################################
    # ORANGE TESTS                            #
    ###########################################
    def test_orange_sample_doc_kmeans(self):
        """``oc.run(...)`` must report ``clusters == [0, 0, 0, 1, 1, 1]``."""
        result = oc.run("orange_clustering_test")
        self.assertEqual([0, 0, 0, 1, 1, 1], result.clusters)

    def test_orange_with_tweets_kmeans(self):
        # NOTE(review): this copy of the method contains nothing but the
        # import below; the rest of the body looks cut off -- confirm
        # against the original file before relying on this test.
        import time
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents
###########################################
# GLOBALS                                #
###########################################
# get_test_documents() yields three values; only the third is used here,
# the first two are bound to the throwaway name ``ignore``.
ignore, ignore, samples  =  get_test_documents()

# Module-level clusterer (k=2) shared by the tests in ``Test``; every
# sample is added with its own add_document() call.
oc = OrangeKmeansClusterer(k=2)        
for sample in samples:
    oc.add_document(sample)

class Test(unittest.TestCase):
    """Tests for the term-document matrix built by the module-level ``oc``."""

    def test_orange_cluster_term_document_matrix(self):
        """Compare ``oc.td_matrix`` with the expected weights element-wise.

        The earlier assertion was
        ``assertEqual(expected.all(), calculated.all())``.  ``all()``
        collapses an array to one boolean ("is every element non-zero?"),
        and ``expected`` contains zeros, so the check reduced to
        ``False == False`` and passed for any matrix holding at least one
        zero.  The matrices are now compared value by value.
        """
        oc.construct_term_doc_matrix()
        calculated = oc.td_matrix
        expected = numpy.array([
            [0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
            [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
            [0, 0, 0, 0.54930614, 0, 0.549306140, 0],
        ])

        # Check the shape explicitly: allclose() broadcasts its arguments,
        # so a wrongly-shaped matrix could otherwise slip through.
        self.assertEqual(expected.shape, numpy.shape(calculated))
        # The expected weights are rounded to 8 decimals, so compare with
        # allclose()'s default tolerances rather than exact equality.
        self.assertTrue(numpy.allclose(calculated, expected))

    def test_orange_save_matrix_to_tab_file(self):
        """Build the matrix and save it as ``sample_table_orange``.

        No assertions: the test only fails if one of the calls raises.
        """
        oc.construct_term_doc_matrix()
        oc.save_table("sample_table_orange")
'''

import datetime, unittest 
from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tools.utils import aggregate_data
from matplotlib.dates import num2date#!@UnresolvedImport
from visualizations.graphs import D3Timeline


# Fetch the documents between from_date and to_date (limit=3000).
ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 27, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=3000)

# Cluster them with k=100 on unigrams, PCA switched off.
oc = OrangeKmeansClusterer(k=100, ngram=1)
oc.add_documents(items)
oc.run("orange_clustering_test", pca=False)

# Score every cluster by "documents per second" over the span between its
# earliest and latest document date.
top_clusters = []
for cluster in oc.clusters:
    documents = cluster.get_documents().values()
    if len(documents) != 0:
        dates = [doc.date for doc in documents]
        latest = max(dates)
        delta = latest - min(dates)
        delta_seconds = delta.total_seconds()
        # A zero-length span would divide by zero below, so clusters whose
        # dates all coincide are left out.
        if delta_seconds != 0:
            rate_growth = float(len(dates)) / delta_seconds
            top_clusters.append((rate_growth, latest, cluster))

# Keep the 20 entries with the highest rate, highest first.
top_clusters = sorted(top_clusters, key=lambda entry: entry[0], reverse=True)[:20]
Exemple #5
0
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, numpy
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents
###########################################
# GLOBALS                                #
###########################################
# get_test_documents() yields three values; only the third is used here,
# the first two are bound to the throwaway name ``ignore``.
ignore, ignore, samples = get_test_documents()

# Module-level clusterer (k=2) shared by the tests in ``Test``; every
# sample is added with its own add_document() call.
oc = OrangeKmeansClusterer(k=2)
for sample in samples:
    oc.add_document(sample)


class Test(unittest.TestCase):
    """Tests for the term-document matrix built by the module-level ``oc``."""

    def test_orange_cluster_term_document_matrix(self):
        """Compare ``oc.td_matrix`` with the expected weights element-wise.

        The earlier assertion was
        ``assertEqual(expected.all(), calculated.all())``.  ``all()``
        collapses an array to one boolean ("is every element non-zero?"),
        and ``expected`` contains zeros, so the check reduced to
        ``False == False`` and passed for any matrix holding at least one
        zero.  The matrices are now compared value by value.
        """
        oc.construct_term_doc_matrix()
        calculated = oc.td_matrix
        expected = numpy.array(
            [[0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
             [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
             [0, 0, 0, 0.54930614, 0, 0.549306140, 0]])

        # Check the shape explicitly: allclose() broadcasts its arguments,
        # so a wrongly-shaped matrix could otherwise slip through.
        self.assertEqual(expected.shape, numpy.shape(calculated))
        # The expected weights are rounded to 8 decimals, so compare with
        # allclose()'s default tolerances rather than exact equality.
        self.assertTrue(numpy.allclose(calculated, expected))

    def test_orange_save_matrix_to_tab_file(self):
        # NOTE(review): this copy of the method stops after building the
        # matrix and never saves anything, despite its name; the body looks
        # cut off -- confirm against the original file.
        oc.construct_term_doc_matrix()
    def test_orange_with_tweets_kmeans(self):
        """Run k-means (k=20, unigrams, PCA on) on warehouse documents.

        Fetches documents between 25 Jan 2011 and 26 Jan 2011
        (``limit=1000``) through the module-level ``ws``, clusters them,
        prints the elapsed wall-clock seconds, then plots the cumulative
        growth timeline and the scatter plot and dumps the clusters to
        ``kmeans_with_tweets_orange``.

        There are no assertions: the test only fails if a call raises.
        """
        import time
        start = time.time()
        from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
        to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
        items = ws.get_documents_by_date(from_date, to_date, limit=1000)

        # Local clusterer: assigning ``oc`` here makes it function-local, so
        # the module-level ``oc`` used by the other tests is not modified.
        oc = OrangeKmeansClusterer(k=20, ngram=1)
        oc.add_documents(items)
        oc.run("orange_clustering_test", pca=True)

        # Parenthesised so the line parses on Python 3 as well; with a single
        # argument Python 2 prints exactly what the bare statement printed.
        print(time.time() - start)

        oc.plot_growth_timeline(cumulative=True)
        oc.plot_scatter()
        oc.dump_clusters_to_file("kmeans_with_tweets_orange")