def get_authors(self):
    '''
    Returns the authors of the documents that appear in this cluster.
    '''
    ws = WarehouseServer()
    authors = set(ws.get_document_authors(self.document_dict.keys()))
    return list(authors)
def test_author_classification_egypt_dataset(self):
    TestAuthor.drop_collection()
    ws = WarehouseServer()

    # Copy authors with more than 200 tweets into the temporary TestAuthor collection.
    for author in ws.get_authors(type=Author):
        if len(author.tweets) > 200:
            t = TestAuthor()
            t.screen_name = author.screen_name
            t.tweets = author.tweets
            t.save()

    authors = ws.get_authors(type=TestAuthor)
    for author in authors:
        print '-----------------------'
        print author.screen_name
        vector = author.update_feature_vector()
        print vector

    # Train a decision tree on the labelled training authors; the attribute
    # list holds the feature names plus the class label.
    classifier = TreeClassifier()
    attributes = ["retweets", "links", "retweeted", "replies", "mentions", "ff-ratio", "class"]
    train_set = numpy.array([author.get_feature_vector_with_type()
                             for author in TrainingAuthor.objects])
    classifier.train(train_set, attributes)

    # Classify the test authors and print the predictions.
    for author in authors:
        prediction = "No prediction"
        if len(author.feature_vector) > 0:
            prediction = classifier.classify(author.get_feature_vector_with_type())
        print author.screen_name
        print prediction
        print '----------------------'

    TestAuthor.drop_collection()
def output_clusters_to_file(clusters, rownames, filename):
    '''
    DEPRECATED
    This method takes as input a set of clusters and generates a very
    simplistic representation of these clusters in text form in a file.
    '''
    ws = WarehouseServer()
    out = file(filename, 'w')
    out.write("Clustering results")
    out.write('\n')
    i = 0
    for cluster in clusters:
        out.write('\n')
        out.write('***********************************************************')
        out.write('\n')
        out.write("Cluster" + str(i))
        out.write('\n')
        for document in cluster:
            out.write(ws.get_document_by_id(rownames[document]).content)
            out.write('\n')
        i += 1
    out.close()
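# Hedged usage sketch for the deprecated helper above: `clusters` is assumed
# to be an iterable of clusters, each holding row indices, and `rownames` to
# map a row index to a document id accepted by
# WarehouseServer.get_document_by_id. The values below are placeholders; real
# document ids from the warehouse are needed for the lookups to succeed.
example_clusters = [[0, 2], [1]]                                  # hypothetical clusters of row indices
example_rownames = {0: "doc_id_0", 1: "doc_id_1", 2: "doc_id_2"}  # hypothetical row -> document id map
output_clusters_to_file(example_clusters, example_rownames, "clusters_example.txt")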
# -*- coding: utf-8 -*-
'''
Created on 23 Jan 2012
@author: george

My playground!
'''
import unittest, os
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"

ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)
index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()


class TestPlayground(unittest.TestCase):

    def test_searching(self):
        results = index.search_by_term("sales")
        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
'''
Created on 26 Jan 2012
@author: george
'''
import unittest, datetime
from analysis.clustering.dbscan import DBSCANClusterer
from database.warehouse import WarehouseServer
from collections import OrderedDict

###########################################
#                 GLOBALS                 #
###########################################
ws = WarehouseServer()
epsilon = 2.0
min_pts = 2.0
points = [
    [1, 1], [1.5, 1], [1.8, 1.5], [2.1, 1],
    [3.1, 2], [4.1, 2], [5.1, 2],
    [10, 10], [11, 10.5], [9.5, 11], [9.9, 11.4],
    [15.0, 17.0], [15.0, 17.0],
    [7.5, -5.0],
]
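# The project's DBSCANClusterer API is not shown in this excerpt, so as an
# independent, hedged illustration of what epsilon/min_pts mean for these
# sample points, here is a sketch using scikit-learn's DBSCAN (an extra
# dependency this repository does not otherwise use). A point is a core point
# if at least `min_pts` points (itself included) lie within `epsilon` of it;
# chains of core points form clusters, and isolated points such as
# [7.5, -5.0] are labelled -1 (noise).
from sklearn.cluster import DBSCAN

labels = DBSCAN(eps=epsilon, min_samples=int(min_pts)).fit_predict(points)
print labels  # one cluster id per point, -1 for noise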
'''
Created on 22 Jan 2012
@author: george
'''
import datetime, os
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from analysis.index import Index
from database.warehouse import WarehouseServer

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
ws = WarehouseServer()
index_path = os.path.join(BASE_PATH, "egypt_index")
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

# Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
from_date = datetime.datetime(2011, 01, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 01, 29, 0, 0, 0)
t.search_between(from_date=from_date, to_date=to_date,
'''
Created on 21 Mar 2012
@author: george
'''
from database.warehouse import WarehouseServer
from database.model.tweets import EvaluationTweet
from analysis.clustering.kmeans import OrangeKmeansClusterer
from evaluation.evaluators import ClusteringEvaluator

ws = WarehouseServer()
documents = ws.get_all_documents(type=EvaluationTweet)

oc = OrangeKmeansClusterer(k=35, ngram=1)
ebe = ClusteringEvaluator(documents)
bcubed_precision, bcubed_recall, bcubed_f = ebe.evaluate(clusterer=oc)
print bcubed_precision, bcubed_recall, bcubed_f
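# ClusteringEvaluator's internals are not part of this excerpt; the variable
# names above suggest it reports B-Cubed precision/recall/F1. For reference,
# this is a self-contained toy computation of those metrics. All names below
# are made up for illustration and are not part of the repository's API.
def bcubed(system_clusters, gold_labels):
    '''system_clusters: item -> cluster id, gold_labels: item -> true label.'''
    items = list(system_clusters.keys())
    precisions, recalls = [], []
    for i in items:
        same_cluster = [j for j in items if system_clusters[j] == system_clusters[i]]
        same_label = [j for j in items if gold_labels[j] == gold_labels[i]]
        correct = [j for j in same_cluster if gold_labels[j] == gold_labels[i]]
        precisions.append(float(len(correct)) / len(same_cluster))
        recalls.append(float(len(correct)) / len(same_label))
    p = sum(precisions) / len(items)
    r = sum(recalls) / len(items)
    return p, r, 2 * p * r / (p + r)

# Toy example: items a and b share a system cluster, c is alone; a and c share a gold label.
print bcubed({'a': 0, 'b': 0, 'c': 1}, {'a': 'x', 'b': 'y', 'c': 'x'})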
'''
@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)

# Collect the distinct screen names of the tweet authors in this window.
screen_names = []
for tweet in items:
    screen_names.append(tweet.author_screen_name)
screen_names = set(screen_names)
print len(screen_names)

# A terrible hack to save the screen_names of users who are mentioned in
# tweets but are not yet in the database. They'll be considered after all
# authors have been stored.
mentions_of_not_stored_users = []
for author_name in screen_names:
'''
Created on 24 Mar 2012
@author: george
'''
import datetime, unittest
from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tools.utils import aggregate_data
from matplotlib.dates import num2date
from visualizations.graphs import D3Timeline

ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 27, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=3000)

# Cluster the documents, then skip empty clusters and clusters whose tweets
# all carry the same timestamp.
oc = OrangeKmeansClusterer(k=100, ngram=1)
oc.add_documents(items)
oc.run("orange_clustering_test", pca=False)

top_clusters = []
for cluster in oc.clusters:
    documents = cluster.get_documents().values()
    if len(documents) == 0:
        continue
    dates = [doc.date for doc in documents]
    delta = max(dates) - min(dates)
    delta_seconds = delta.total_seconds()
    if delta_seconds == 0:
        continue
'''
Created on 22 Jan 2012
@author: george
'''
import datetime, os
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from analysis.index import Index
from mongoengine import *
from database.warehouse import WarehouseServer

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
ws = WarehouseServer()
index_path = os.path.join(BASE_PATH, "egypt_index")
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

# Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
## Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 01, 24, 0, 0, 0)
'''
Created on 22 Mar 2012
@author: george

This script allows us to annotate known events with their labels.
'''
import datetime
from database.warehouse import WarehouseServer
from mongoengine import connect
connect("pythia_db")
from evaluation.evaluators import AbstractEvaluator

ws = WarehouseServer()
from_date = datetime.datetime(2011, 01, 25, 12, 0, 0)
to_date = datetime.datetime(2011, 01, 25, 12, 5, 0)
tweet_list = ws.get_documents_by_date(from_date, to_date)
ce = AbstractEvaluator(tweet_list)
ce.annotate_dataset()
'''
Created on 21 Mar 2012
@author: george
'''
import numpy
from database.warehouse import WarehouseServer
from analysis.classification.tree import TreeClassifier
from database.model.agents import TrainingAuthor
from evaluation.evaluators import ClassificationEvaluator

ws = WarehouseServer()
authors = ws.get_all_documents(type=TrainingAuthor)
ce = ClassificationEvaluator(authors, ["Celebrity", "Media", "Journalists", "Activists", "Commoner"])
metrics = ce.evaluate(classifier=TreeClassifier(), K=10)
print metrics
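# ClassificationEvaluator's internals are not part of this excerpt; the K=10
# argument suggests 10-fold cross-validation. For reference, this is a
# generic sketch of how such folds can be formed and scored. The helper and
# its `score_fold` callback are illustrative assumptions, not the
# repository's API.
def kfold_scores(n_items, score_fold, K=10):
    '''score_fold(train_idx, test_idx) -> float; returns the mean score over K folds.'''
    indices = numpy.arange(n_items)
    numpy.random.shuffle(indices)
    folds = numpy.array_split(indices, K)
    scores = []
    for k, test_idx in enumerate(folds):
        train_idx = numpy.concatenate(folds[:k] + folds[k + 1:])
        scores.append(score_fold(train_idx, test_idx))
    return sum(scores) / float(K)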