class PersisterTest(unittest.TestCase):
    """Round-trip tests for Persister: what we save is what we load."""

    def setUp(self):
        self.persister = Persister("/tmp/persister_test")

    def save_load(self, message):
        """Persist *message* to disk, then read it straight back."""
        self.persister.save(message)
        return self.persister.load()

    def test_save_load(self):
        text = "Hello World!"
        self.assertEqual(text, self.save_load(text))

    def test_unicode(self):
        # Non-ASCII payload must survive the round trip unchanged.
        text = "ÖkuarneåÅäÄöÖ"
        self.assertEqual(text, self.save_load(text))

    def test_dictionary(self):
        payload = {'key': "value"}
        self.assertEqual(payload, self.save_load(payload))

    def test_attempt_to_load_nonexisting_file_should_return_none(self):
        missing = Persister("/tmp/this_file_does_not_exist")
        self.assertIsNone(missing.load())

    def test_default_return(self):
        # When the backing file is absent, load() hands back the
        # caller-supplied default instead of None.
        default_return = {}
        missing = Persister("/tmp/this_file_does_not_exist")
        self.assertEqual(default_return, missing.load(default_return))
def main():
    """
    tweet_indexer consumes the output (tweetfile) created by tweet_scanner and creates:

    * indexfile: searchable dictionary - {word: [position: url_id]
    * graphfile: each url and their outbound links {url: [list of urls]}
    * url_lookupfile: dictionary containing url ids - {url_id: url}
    """
    try:
        klogger.logger = klogger.get_logger("ketchlip", "indexer.log")

        input_queue = Queue()
        output_queue = Queue()

        # All persisted artifacts live under the configured base dir.
        base_dir = config.config.base_dir
        tweetfile = base_dir + "tweets.txt"  # timestamp \t url
        indexfile = base_dir + "index"
        graphfile = base_dir + "graph"
        url_lookupfile = base_dir + "url_lookup"
        lookup_urlfile = base_dir + "lookup_url"
        since_file = base_dir + "since"

        index_persister = Persister(indexfile)
        graph_persister = Persister(graphfile)
        url_lookup_persister = Persister(url_lookupfile)
        lookup_url_persister = Persister(lookup_urlfile)
        since_persister = Persister(since_file)

        # Resume from any previously saved state; 'since' is the timestamp
        # watermark of the last tweet already indexed (None on first run).
        index = index_persister.load({})
        graph = graph_persister.load({})
        lookup_url = lookup_url_persister.load({})
        since = since_persister.load()

        indexer = Indexer()
        indexer.index = index
        indexer.graph = graph
        indexer.lookup_url = lookup_url

        klogger.info("Indexing " + tweetfile)
        if since:
            klogger.info("Since " + str(since))

        include_count = 0
        exclude_count = 0
        # FIX: context manager guarantees the tweet file is closed; the
        # original leaked the file handle.
        with open(tweetfile, "r") as url_list:
            for timestamp_url in url_list:
                timestamp, url = timestamp_url.split("\t")
                url = url.strip()
                # Queue only urls not seen before and not older than the
                # persisted watermark.
                if url not in lookup_url and (not since or since <= timestamp):
                    input_queue.put_nowait(url)
                    since = timestamp
                    include_count += 1
                else:
                    exclude_count += 1

        klogger.info("Including: " + str(include_count) + " Excluding: " + str(exclude_count))

        if include_count <= 0:
            klogger.info("Nothing to index")  # FIX: typo "Nothting"
            return

        # Spawn off crawler and indexer; joinall blocks until both finish.
        gevent.joinall([
            gevent.spawn(Crawler().gevent_crawl, input_queue, output_queue),
            gevent.spawn(indexer.gevent_index, input_queue, output_queue)
        ])

        if not indexer.done:
            # FIX: log and return as two statements instead of
            # `return klogger.info(...)` — same behavior, clearer intent.
            klogger.info("Indexing failed")
            return

        index = indexer.index
        graph = indexer.graph
        url_lookup = indexer.url_lookup
        lookup_url = indexer.lookup_url

        index_persister.save(index)
        graph_persister.save(graph)
        url_lookup_persister.save(url_lookup)
        lookup_url_persister.save(lookup_url)
        since_persister.save(since)

        klogger.info("Saved index in " + indexfile + " (length " + str(len(index)) + ")")
        klogger.info("Saved graph in " + graphfile + " (length " + str(len(graph)) + ")")
        klogger.info("Saved lookup url in " + lookup_urlfile + " (length " + str(len(lookup_url)) + ")")
        klogger.info("Saved url lookup in " + url_lookupfile + " (length " + str(len(url_lookup)) + ")")
        klogger.info("Indexing completed")
    except KeyboardInterrupt:
        klogger.info('^C received, shutting down indexer')
    except Exception as e:
        # FIX: Python-2-only `except Exception, e` -> `as e`
        # (valid on Python 2.6+ and Python 3).
        klogger.exception(e)
def test_default_return(self):
    """load(default) returns the given default when the file is missing."""
    fallback = {}
    missing = Persister("/tmp/this_file_does_not_exist")
    self.assertEqual(fallback, missing.load(fallback))
def test_attempt_to_load_nonexisting_file_should_return_none(self):
    """Loading from a path that does not exist yields None."""
    missing = Persister("/tmp/this_file_does_not_exist")
    self.assertIsNone(missing.load())