def test_sample_doc_clustering_with_online(self):
    """Cluster the sample "orange" documents online and print the result.

    Feeds each fixture document into a 2-cluster online clusterer with a
    window of 3, then prints each cluster's document_dict for inspection.
    """
    oc = OnlineClusterer(N=2, window=3)
    samples = get_orange_clustering_test_data()
    for document in samples:
        # index is the position assigned to the document by the clusterer
        index = oc.add_document(document)
        oc.cluster(document)
    # NOTE(review): `expected` and `index` are never used and this test makes
    # no assertion — it only prints. Presumably the per-document cluster
    # assignments should be compared against `expected` ([0,0,0,1,1,1] means
    # the first three documents land in cluster 0 and the last three in
    # cluster 1) — TODO: add a real assertion once the cluster-membership
    # accessor is confirmed.
    expected = [0, 0, 0, 1, 1, 1]
    for cluster in oc.clusters:
        print cluster.document_dict
def test_cluster_term_document_matrix(self):
    """Verify the tf-idf term-document matrix built during online clustering.

    Clusters the sample "orange" documents and checks the resulting
    td_matrix against precomputed tf-idf weights, element-wise with a
    floating-point tolerance.

    NOTE(review): a method with this same name is defined again later in
    this class and shadows this one under unittest discovery — one of the
    two should be renamed or removed.
    """
    oc = OnlineClusterer(N=2, window=3)
    # BUG FIX: `samples` was undefined here (NameError); fetch the same
    # fixture the sibling sample-doc test uses.
    samples = get_orange_clustering_test_data()
    for document in samples:
        oc.add_document(document)
        oc.cluster(document)
    calculated = oc.td_matrix
    expected = numpy.array(
        [[0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
         [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
         [0, 0, 0, 0.54930614, 0, 0.54930614, 0]])
    # BUG FIX: the original asserted expected.all() == calculated.all(),
    # which reduces each matrix to a single boolean and passes for almost
    # any pair of matrices. Compare element-wise with a tolerance instead.
    self.assertTrue(numpy.allclose(expected, calculated))
def test_cluster_term_document_matrix(self):
    """Verify the tf-idf term-document matrix built during online clustering.

    Clusters the sample "orange" documents and checks the resulting
    td_matrix against precomputed tf-idf weights, element-wise with a
    floating-point tolerance.

    NOTE(review): this is a duplicate of an earlier method with the same
    name in this class; under unittest discovery only this later definition
    runs. The two copies should be merged or renamed.
    """
    oc = OnlineClusterer(N=2, window=3)
    # BUG FIX: `samples` was undefined here (NameError); fetch the same
    # fixture the sibling sample-doc test uses.
    samples = get_orange_clustering_test_data()
    for document in samples:
        oc.add_document(document)
        oc.cluster(document)
    calculated = oc.td_matrix
    expected = numpy.array(
        [[0.31388923, 0.11584717, 0, 0, 0, 0, 0.47083384],
         [0, 0.13515504, 0.3662041, 0, 0.3662041, 0, 0],
         [0, 0, 0, 0.54930614, 0, 0.54930614, 0]])
    # BUG FIX: the original asserted expected.all() == calculated.all(),
    # which reduces each matrix to a single boolean and passes for almost
    # any pair of matrices. Compare element-wise with a tolerance instead.
    self.assertTrue(numpy.allclose(expected, calculated))
def test_online_clustering_with_tweets(self): from_date = datetime.datetime(2011, 1, 25, 0, 0, 0) to_date = datetime.datetime(2011, 1, 26, 0, 00, 0) items = ws.get_top_documents_by_date(from_date, to_date, threshold=1000) window = 300 oc = OnlineClusterer(N=50, window = window) for item in items: oc.cluster(item) clusters=oc.trimclusters() oc.dump_clusters_to_file("online_with_tweets") oc.plot_scatter() oc.plot_growth_timeline(cumulative=True) for cluster in oc.clusters: print cluster.id print cluster.get_size() print '-----------------'
def test_online_clustering_with_tweets(self): from_date = datetime.datetime(2011, 1, 25, 0, 0, 0) to_date = datetime.datetime(2011, 1, 26, 0, 00, 0) items = ws.get_documents_by_date(from_date, to_date, limit=200) window = 100 oc = OnlineClusterer(N=50, window = window) for item in items: oc.cluster(item) clusters=oc.trimclusters() oc.dump_clusters_to_file("online_with_tweets") #oc.plot_scatter() #oc.plot_growth_timeline(cumulative=True) for cluster in oc.clusters: sorted = cluster.summarize() for doc in sorted: print doc.dist, doc.raw print '--------------------'
def test_online_clustering_with_tweets(self): from_date = datetime.datetime(2011, 1, 25, 0, 0, 0) to_date = datetime.datetime(2011, 1, 26, 0, 00, 0) items = ws.get_top_documents_by_date(from_date, to_date, threshold=1000) window = 300 oc = OnlineClusterer(N=50, window=window) for item in items: oc.cluster(item) clusters = oc.trimclusters() oc.dump_clusters_to_file("online_with_tweets") oc.plot_scatter() oc.plot_growth_timeline(cumulative=True) for cluster in oc.clusters: print cluster.id print cluster.get_size() print '-----------------'