def _filter_terms(self):
    '''
    Removes tokens that appear either too often or too rarely. It returns
    the corpus of filtered tokens and also updates the token and
    word_frequencies lists of each document according to the filtered tokens.
    '''
    index = Index("kmeans_index")
    index.add_documents(self.db_documents)
    index.finalize()
    filtered_terms = index.get_filtered_terms(lowestf=0.1, highestf=0.2)

    corpus = []
    # Iterate over a copy (items()) so that documents can be removed from the
    # dictionary safely while looping.
    for id, document in self.document_dict.items():
        filtered_tokens = []
        for token in document.tokens:
            if token in filtered_terms:
                filtered_tokens.append(token)
        if len(filtered_tokens) == 0:
            # If all the tokens are removed then this document is worthless.
            self.document_dict.pop(id)
        else:
            # If there are still tokens, amend the token list and the word
            # frequencies list to include only the relevant tokens.
            self.document_dict[id].tokens = filtered_tokens
            self.document_dict[id].word_frequencies = [item for item in self.document_dict[id].word_frequencies
                                                       if item.word in filtered_tokens]
            corpus.append(filtered_tokens)
    return corpus
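# A minimal usage sketch (assumptions: `_filter_terms` belongs to a clustering
# helper class whose instance is called `clusterer` here and already holds
# db_documents and document_dict; that name and setup are hypothetical, not
# part of the original code). It calls the method and inspects the surviving
# documents after frequency filtering.
corpus = clusterer._filter_terms()
print '%d documents kept after frequency filtering' % len(corpus)
for filtered_tokens in corpus[:5]:
    print filtered_tokens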
'''
@author: george

My playground!
'''
import unittest, os

from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"

ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()


class TestPlayground(unittest.TestCase):

    def test_searching(self):
        results = index.search_by_term("sales")
        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']
        self.assertEqual(expected, calculated)
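# The playground tests above can be run directly with the standard unittest
# entry point below (an assumption; this guard is not part of the original file).
if __name__ == '__main__':
    unittest.main()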
try:
    os.makedirs(index_path)
except os.error:
    raise Exception(index_path + " could not be created.")

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
from_date = datetime.datetime(2011, 01, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 01, 29, 0, 0, 0)
t.search_between(from_date=from_date, to_date=to_date,
                 granularity_days=0, granularity_hours=0, granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
try:
    os.makedirs(index_path)
except os.error:
    raise Exception(index_path + " could not be created.")

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
##Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 01, 24, 0, 0, 0)
to_date = datetime.datetime(2011, 01, 25, 0, 0, 0)
t.search_between(from_date=from_date, to_date=to_date,
                 granularity_days=0, granularity_hours=0, granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
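# A hedged follow-up sketch: once finalize() has run, the freshly built index
# can be spot-checked with search_by_term (the same call used in TestPlayground
# above), e.g. against one of the crawled hashtags. The choice of "tahrir"
# here is illustrative only, not part of the original script.
for doc in index.search_by_term("tahrir"):
    print doc.get('id')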