def test_orange_with_tweets_kmeans(self):
    '''
    Smoke-test k-means clustering on documents fetched by date.

    Requests up to 1000 documents dated between 26 and 27 Jan 2011 through
    ws.get_documents_by_date, clusters them with
    OrangeKmeansClusterer(k=34, ngram=1) with pca=False, prints the elapsed
    wall-clock time in seconds, then calls the plotting helpers and dumps
    the clusters to "kmeans_with_tweets_orange".

    Nothing is asserted: the test passes as long as no step raises.
    '''
    import time

    started_at = time.time()

    from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
    to_date = datetime.datetime(2011, 1, 27, 0, 0, 0)
    items = ws.get_documents_by_date(from_date, to_date, limit=1000)

    clusterer = OrangeKmeansClusterer(k=34, ngram=1)
    clusterer.add_documents(items)
    clusterer.run("orange_clustering_test", pca=False)

    # The timing covers fetching and clustering only, not the plotting below.
    # Call form of print so the line is valid on both Python 2 and Python 3.
    print(time.time() - started_at)

    clusterer.plot_growth_timeline(cumulative=False)
    clusterer.plot_scatter()
    clusterer.dump_clusters_to_file("kmeans_with_tweets_orange")
@author: george Unit tests for the analysis.clustering package. ''' import datetime, unittest from database.warehouse import WarehouseServer from analysis.clustering.kmeans import OrangeKmeansClusterer from tests.test_document import get_orange_clustering_test_data ########################################### # GLOBALS # ########################################### ws = WarehouseServer() sample_docs = get_orange_clustering_test_data() oc = OrangeKmeansClusterer(k=2) for s in sample_docs: oc.add_document(s) class TestOrangeClustering(unittest.TestCase): ########################################### # ORANGE TESTS # ########################################### def test_orange_sample_doc_kmeans(self): km = oc.run("orange_clustering_test") expected = [0, 0, 0, 1, 1, 1] self.assertEqual(expected, km.clusters) def test_orange_with_tweets_kmeans(self): import time
'''
Created on 26 Jan 2012

@author: george
'''
import unittest

import numpy

from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents

###########################################
# GLOBALS                                 #
###########################################
# Only the third value returned by get_test_documents() is used here.
ignore, ignore, samples = get_test_documents()

# One clusterer, filled once at import time, is shared by every test below.
oc = OrangeKmeansClusterer(k=2)
for sample in samples:
    oc.add_document(sample)


class Test(unittest.TestCase):
    '''
    Tests for the term-document matrix built by OrangeKmeansClusterer.
    '''

    def test_orange_cluster_term_document_matrix(self):
        '''
        The matrix built from the sample documents must match the expected
        3x7 array of weights element by element.
        '''
        oc.construct_term_doc_matrix()
        calculated = numpy.asarray(oc.td_matrix, dtype=float)
        expected = numpy.array([
            [0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
            [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
            [0, 0, 0, 0.54930614, 0, 0.549306140, 0],
        ])

        # ndarray.all() only reports whether every element is non-zero, so
        # comparing the two .all() results passed for any matrix containing
        # a zero. Compare shape and values instead; allclose alone would
        # broadcast and could hide a shape mismatch.
        self.assertEqual(expected.shape, calculated.shape)
        self.assertTrue(
            numpy.allclose(calculated, expected, atol=1e-6),
            "term-document matrix differs from expected:\n%s" % calculated)

    def test_orange_save_matrix_to_tab_file(self):
        '''
        Build the matrix and save it under the name "sample_table_orange".
        Nothing is asserted: the test passes as long as no step raises.
        '''
        oc.construct_term_doc_matrix()
        oc.save_table("sample_table_orange")
''' import datetime, unittest from database.warehouse import WarehouseServer from analysis.clustering.kmeans import OrangeKmeansClusterer from tools.utils import aggregate_data from matplotlib.dates import num2date#!@UnresolvedImport from visualizations.graphs import D3Timeline ws = WarehouseServer() from_date = datetime.datetime(2011, 1, 26, 0, 0, 0) to_date = datetime.datetime(2011, 1, 27, 0, 0, 0) items = ws.get_documents_by_date(from_date, to_date, limit=3000) oc = OrangeKmeansClusterer(k=100, ngram=1) oc.add_documents(items) oc.run("orange_clustering_test", pca=False) top_clusters = [] for cluster in oc.clusters: documents = cluster.get_documents().values() if len(documents) == 0 : continue dates = [doc.date for doc in documents] delta = max(dates) - min(dates) delta_seconds = delta.total_seconds() if delta_seconds == 0: continue rate_growth = float(len(dates))/delta_seconds top_clusters.append( (rate_growth, max(dates), cluster) ) top_clusters = sorted(top_clusters, key=lambda x: -x[0])[:20]
'''
Created on 26 Jan 2012

@author: george
'''
import unittest

import numpy

from analysis.clustering.kmeans import OrangeKmeansClusterer
from tests.test_document import get_test_documents

###########################################
# GLOBALS                                 #
###########################################
# Only the third value returned by get_test_documents() is used here.
ignore, ignore, samples = get_test_documents()

# One clusterer, filled once at import time, is shared by every test below.
oc = OrangeKmeansClusterer(k=2)
for sample in samples:
    oc.add_document(sample)


class Test(unittest.TestCase):
    '''
    Tests for the term-document matrix built by OrangeKmeansClusterer.
    '''

    def test_orange_cluster_term_document_matrix(self):
        '''
        The matrix built from the sample documents must match the expected
        3x7 array of weights element by element.
        '''
        oc.construct_term_doc_matrix()
        calculated = numpy.asarray(oc.td_matrix, dtype=float)
        expected = numpy.array(
            [[0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
             [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
             [0, 0, 0, 0.54930614, 0, 0.549306140, 0]])

        # ndarray.all() only reports whether every element is non-zero, so
        # comparing the two .all() results passed for any matrix containing
        # a zero. Compare shape and values instead; allclose alone would
        # broadcast and could hide a shape mismatch.
        self.assertEqual(expected.shape, calculated.shape)
        self.assertTrue(
            numpy.allclose(calculated, expected, atol=1e-6),
            "term-document matrix differs from expected:\n%s" % calculated)

    def test_orange_save_matrix_to_tab_file(self):
        '''
        Build the term-document matrix; passes as long as that call does
        not raise.
        '''
        oc.construct_term_doc_matrix()
def test_orange_with_tweets_kmeans(self):
    '''
    Smoke-test k-means clustering (with PCA) on documents fetched by date.

    Requests up to 1000 documents dated between 25 and 26 Jan 2011 through
    ws.get_documents_by_date, clusters them with
    OrangeKmeansClusterer(k=20, ngram=1) with pca=True, prints the elapsed
    wall-clock time in seconds, then calls the plotting helpers (cumulative
    growth timeline, scatter) and dumps the clusters to
    "kmeans_with_tweets_orange".

    Nothing is asserted: the test passes as long as no step raises.
    '''
    import time

    started_at = time.time()

    from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
    to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
    items = ws.get_documents_by_date(from_date, to_date, limit=1000)

    clusterer = OrangeKmeansClusterer(k=20, ngram=1)
    clusterer.add_documents(items)
    clusterer.run("orange_clustering_test", pca=True)

    # The timing covers fetching and clustering only, not the plotting below.
    # Call form of print so the line is valid on both Python 2 and Python 3.
    print(time.time() - started_at)

    clusterer.plot_growth_timeline(cumulative=True)
    clusterer.plot_scatter()
    clusterer.dump_clusters_to_file("kmeans_with_tweets_orange")