import datetime
import os

from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from database.warehouse import WarehouseServer

try:
    os.makedirs(index_path)
except os.error:
    raise Exception(index_path + " could not be created.")

# Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = ("#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak "
                   "OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo")
t.search_for(search_hashtags)
from_date = datetime.datetime(2011, 1, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 1, 29, 0, 0, 0)
t.search_between(from_date=from_date,
                 to_date=to_date,
                 granularity_days=0,
                 granularity_hours=0,
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

# Index all the documents
ws = WarehouseServer()
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
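
# Note: index_path and the Index class are used above but defined in a part of
# the script not shown here. A minimal sketch of the interface the script seems
# to assume, built on collections.Counter -- an illustration only, not the
# project's actual indexer, and the `content` attribute is an assumed field name:
import collections

class SketchIndex(object):
    """Counts term frequencies over a document collection."""
    def __init__(self, path):
        self.path = path                      # where a real index would persist data
        self.counts = collections.Counter()

    def add_documents(self, docs):
        # Assumes each document exposes its text via a `content` attribute.
        for doc in docs:
            self.counts.update(doc.content.lower().split())

    def finalize(self):
        pass                                  # a real index would flush to disk here

    def get_top_terms(self, limit=100):
        return [term for term, _ in self.counts.most_common(limit)]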
import datetime

from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()

ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)

screen_names = []
for tweet in items:
    screen_names.append(tweet.author_screen_name)
screen_names = set(screen_names)
print len(screen_names)

# A terrible hack to save the screen names of users who are mentioned in tweets
# but are not yet in the database. They'll be considered after all authors have
# been stored.
mentions_of_not_stored_users = []
for author_name in screen_names:
    author = Author.objects(screen_name=author_name)
    if len(author) == 0:  # If not in the db yet
        tweets = EgyptTweet.objects(author_screen_name=author_name)
        author = Author()
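
# The mentions collected in mentions_of_not_stored_users presumably come from
# @-handles inside the tweet text. A sketch of that extraction under that
# assumption; `content` is an assumed field name for the tweet body:
import re

MENTION_RE = re.compile(r'@(\w{1,15})')  # Twitter handles are at most 15 chars

def extract_mentions(tweets):
    """Return the set of screen names @-mentioned in a batch of tweets."""
    mentioned = set()
    for tweet in tweets:
        mentioned.update(MENTION_RE.findall(tweet.content))
    return mentioned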
import datetime
import os

from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from database.warehouse import WarehouseServer

try:
    os.makedirs(index_path)
except os.error:
    raise Exception(index_path + " could not be created.")

# Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = ("#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak "
                   "OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo")
t.search_for(search_hashtags)

## Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 1, 24, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
t.search_between(from_date=from_date,
                 to_date=to_date,
                 granularity_days=0,
                 granularity_hours=0,
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

# Index all the documents
ws = WarehouseServer()
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
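
# search_between() with granularity_mins=5 presumably slices the crawl range
# into consecutive five-minute windows (one Topsy query per window). A sketch
# of that slicing logic, under that assumption:
import datetime

def five_minute_windows(start, end):
    """Yield (window_start, window_end) pairs covering [start, end)."""
    step = datetime.timedelta(minutes=5)
    current = start
    while current < end:
        yield current, min(current + step, end)
        current += step

# e.g. list(five_minute_windows(from_date, to_date)) yields 288 windows per day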
'''
@author: george
'''
import datetime
import unittest

from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tools.utils import aggregate_data
from matplotlib.dates import num2date  #@UnresolvedImport
from visualizations.graphs import D3Timeline

ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 27, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=3000)

oc = OrangeKmeansClusterer(k=100, ngram=1)
oc.add_documents(items)
oc.run("orange_clustering_test", pca=False)

top_clusters = []
for cluster in oc.clusters:
    documents = cluster.get_documents().values()
    if len(documents) == 0:
        continue
    dates = [doc.date for doc in documents]
    delta = max(dates) - min(dates)
    delta_seconds = delta.total_seconds()
    if delta_seconds == 0:
        continue
    rate_growth = float(len(dates)) / delta_seconds
    top_clusters.append((rate_growth, max(dates), cluster))
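
# rate_growth is tweets per second over each cluster's lifespan, so sorting the
# (rate, last_date, cluster) tuples in descending order surfaces the fastest-
# growing clusters. A sketch of the likely next step, not the script's actual
# continuation:
top_clusters.sort(reverse=True)
for rate, last_seen, cluster in top_clusters[:10]:
    print rate, last_seen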
'''
Created on 22 Mar 2012

@author: george

This script allows us to annotate known events with their labels.
'''
import datetime

from database.warehouse import WarehouseServer
from evaluation.evaluators import AbstractEvaluator
from mongoengine import connect

connect("pythia_db")

ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 25, 12, 0, 0)
to_date = datetime.datetime(2011, 1, 25, 12, 5, 0)
tweet_list = ws.get_documents_by_date(from_date, to_date)

ce = AbstractEvaluator(tweet_list)
ce.annotate_dataset()
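
# annotate_dataset() is a black box here; a manual labelling loop of the kind
# it might implement, assuming each tweet has a `content` attribute and a
# writable `label` field (both hypothetical names):
def annotate(tweets):
    for tweet in tweets:
        print tweet.content
        tweet.label = raw_input("Event label for this tweet: ")
        tweet.save()  # mongoengine documents persist themselves with save()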