Example #1
def _filter_terms(self):
    '''
    Removes tokens that appear either too often or too rarely. Returns the corpus of
    filtered token lists and updates the tokens and word_frequencies lists of each
    document to contain only the surviving tokens. Documents left with no tokens are
    dropped from document_dict.
    '''
    index = Index("kmeans_index")
    index.add_documents(self.db_documents)
    index.finalize()
    filtered_terms = index.get_filtered_terms(lowestf=0.1, highestf=0.2)
    corpus = []
    # Iterate over a copy of the items so that documents can be removed safely below.
    for id, document in self.document_dict.items():
        filtered_tokens = []
        for token in document.tokens:
            if token in filtered_terms:
                filtered_tokens.append(token)

        if len(filtered_tokens) == 0: # if all the tokens are removed then this document is worthless
            self.document_dict.pop(id)
        else: # if there are still tokens, amend the token and word frequency lists to include only the relevant tokens
            self.document_dict[id].tokens = filtered_tokens
            self.document_dict[id].word_frequencies = [item for item in self.document_dict[id].word_frequencies if item.word in filtered_tokens]
            corpus.append(filtered_tokens) # only surviving documents contribute a token list to the corpus
    return corpus
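
A minimal usage sketch of the method above, assuming a clusterer object that exposes document_dict and _filter_terms exactly as in the snippet (the clusterer variable and its setup are hypothetical, not part of the original code):

# Hypothetical usage: _filter_terms prunes the corpus and document_dict in step.
corpus = clusterer._filter_terms()
# One token list per surviving document; documents that lost all tokens were dropped.
assert len(corpus) == len(clusterer.document_dict)
for tokens in corpus:
    assert len(tokens) > 0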
Example #2
'''
@author: george

My playground!
'''
import unittest, os
import datetime
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet, EgyptTweet
from crawlers.CrawlerFactory import CrawlerFactory  # module path assumed; adjust to the project layout

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"
ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

# Make sure the index directory exists before opening the index
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()

class TestPlayground(unittest.TestCase):
  
    def test_searching(self):        
        results = index.search_by_term("sales")
        
        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
            
        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']
        self.assertEqual(expected, calculated)
    
#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
                   OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
from_date = datetime.datetime(2011, 1, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 1, 29, 0, 0, 0)
t.search_between(from_date=from_date, 
                 to_date=to_date, 
                 granularity_days=0, 
                 granularity_hours=0, 
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
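
As listed, the playground module defines TestPlayground but never invokes it; a standard unittest entry point (an assumption, not part of the scraped file) would let the script run its test after the module-level setup:

if __name__ == '__main__':
    unittest.main()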
Example #4
'''
@author: george

My playground!
'''
import unittest, os
import datetime
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet, EgyptTweet
from crawlers.CrawlerFactory import CrawlerFactory  # module path assumed; adjust to the project layout

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"
ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

# Make sure the index directory exists before opening the index
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()


class TestPlayground(unittest.TestCase):
    def test_searching(self):
        results = index.search_by_term("sales")

        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))

        expected = ['4f2d602780286c38a7000013', '4f2d603280286c38a700001e']
        self.assertEqual(expected, calculated)

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
                   OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"

t.search_for(search_hashtags)
##Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 1, 24, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
t.search_between(from_date=from_date,
                 to_date=to_date,
                 granularity_days=0,
                 granularity_hours=0,
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term