def _filter_terms(self):
    """Remove tokens that appear either too often or too rarely.

    Builds a temporary index over ``self.db_documents`` and keeps only the
    terms whose frequency falls inside [lowestf=0.1, highestf=0.2] as
    reported by ``Index.get_filtered_terms``.  Documents left with no
    surviving tokens are dropped from ``self.document_dict``; the remaining
    documents have their ``tokens`` and ``word_frequencies`` lists pruned to
    the surviving vocabulary.

    Returns:
        list: one list of filtered tokens per document that was kept.
    """
    index = Index("kmeans_index")
    index.add_documents(self.db_documents)
    index.finalize()
    # Use a set: membership tests below run once per token, and list
    # membership would make the loop quadratic.
    filtered_terms = set(index.get_filtered_terms(lowestf=0.1, highestf=0.2))

    corpus = []
    # Iterate over a snapshot of the items.  The original code popped
    # entries out of document_dict while iterating iteritems(), which
    # raises "dictionary changed size during iteration" at runtime.
    for doc_id, document in list(self.document_dict.items()):
        filtered_tokens = [t for t in document.tokens if t in filtered_terms]
        if not filtered_tokens:
            # All tokens were removed, so this document is worthless.
            self.document_dict.pop(doc_id)
        else:
            # Amend the token list and the word-frequencies list so they
            # include only the surviving terms.
            kept = set(filtered_tokens)
            document.tokens = filtered_tokens
            document.word_frequencies = [
                item for item in document.word_frequencies
                if item.word in kept
            ]
            corpus.append(filtered_tokens)
    return corpus
# Create the directory that will hold the index; fail loudly if it cannot
# be created (os.error is an alias of OSError).
try:
    os.makedirs(index_path)
except os.error:
    raise Exception(index_path + " could not be created.")

# Save the tweets in the db.
f = CrawlerFactory()
t = f.get_crawler("topsy")
search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
# NOTE: month/day literals normalized from the octal-style 01 form
# (same values; 01 is a syntax error on Python 3).
from_date = datetime.datetime(2011, 1, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 1, 29, 0, 0, 0)
# Crawl in 5-minute windows across the whole date range.
t.search_between(from_date=from_date,
                 to_date=to_date,
                 granularity_days=0,
                 granularity_hours=0,
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

# Index all the documents that were just stored.
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print('Started indexing')
index.add_documents(docs)
index.finalize()
# Bug fix: this message previously duplicated 'Started indexing'.
print('Finished indexing')
for term in index.get_top_terms(limit=100):
    print(term)
except os.error: raise Exception(index_path + " could not be created.") #Save the tweets in the db f = CrawlerFactory() t = f.get_crawler("topsy") search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \ OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo" t.search_for(search_hashtags) ##Last update ended at 2011-01-27 09:00:00 from_date = datetime.datetime(2011, 01, 24, 0, 0, 0) to_date = datetime.datetime(2011, 01, 25, 0, 0, 0) t.search_between(from_date=from_date, to_date=to_date, granularity_days=0, granularity_hours=0, granularity_mins=5) t.retrieve_items_of_type(EgyptTweet) t.crawl(only_english=True) #Index all the documents docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet) index = Index(index_path) print 'Started indexing' index.add_documents(docs) index.finalize() print 'Started indexing' for term in index.get_top_terms(limit=100): print term