Example #1
0
class PersisterTest(unittest.TestCase):
    """Exercise Persister save/load round trips and missing-file loads."""

    def setUp(self):
        """Create the persister shared by the round-trip tests."""
        scratch_path = "/tmp/persister_test"
        self.persister = Persister(scratch_path)

    def save_load(self, message):
        """Save ``message`` through the shared persister and return what load() yields."""
        store = self.persister
        store.save(message)
        loaded = store.load()
        return loaded

    def test_save_load(self):
        """A plain string comes back equal after save followed by load."""
        expected = "Hello World!"
        actual = self.save_load(expected)
        self.assertEqual(expected, actual)

    def test_unicode(self):
        """A string with non-ASCII characters comes back equal after a round trip."""
        expected = "ÖkuarneåÅäÄöÖ"
        actual = self.save_load(expected)
        self.assertEqual(expected, actual)

    def test_dictionary(self):
        """A dictionary comes back equal after a round trip."""
        expected = {'key': "value"}
        actual = self.save_load(expected)
        self.assertEqual(expected, actual)

    def test_attempt_to_load_nonexisting_file_should_return_none(self):
        """load() without a default is expected to give None for a missing file."""
        missing = Persister("/tmp/this_file_does_not_exist")
        result = missing.load()
        self.assertIsNone(result)

    def test_default_return(self):
        """load(default) is expected to give back the default for a missing file."""
        fallback = {}
        missing = Persister("/tmp/this_file_does_not_exist")
        loaded = missing.load(fallback)
        self.assertEqual(fallback, loaded)
Example #2
0
def main():
    """
    Crawl and index the URLs listed in the tweet file.

    tweet_indexer consumes the output (tweetfile) created by tweet_scanner
    and creates:
    * indexfile: searchable dictionary - {word: [position: url_id]}
    * graphfile: each url and their outbound links - {url: [list of urls]}
    * url_lookupfile: dictionary containing url ids - {url_id: url}
    * lookup_urlfile: container of already indexed urls (presumably the
      reverse mapping {url: url_id} - confirm against Indexer)
    * since: timestamp of the last tweet that was queued for indexing

    Only urls that are not yet in lookup_url and whose timestamp is not
    older than the persisted "since" value are queued. Nothing is saved
    when no url qualifies or when the indexer does not report completion.

    KeyboardInterrupt and any other Exception are logged through klogger
    and not propagated to the caller.
    """
    try:
        klogger.logger = klogger.get_logger("ketchlip", "indexer.log")

        input_queue = Queue()
        output_queue = Queue()

        base_dir = config.config.base_dir

        tweetfile = base_dir + "tweets.txt" # timestamp \t url
        indexfile = base_dir + "index"
        graphfile = base_dir + "graph"
        url_lookupfile = base_dir + "url_lookup"
        lookup_urlfile = base_dir + "lookup_url"
        since_file = base_dir + "since"

        index_persister = Persister(indexfile)
        graph_persister = Persister(graphfile)
        url_lookup_persister = Persister(url_lookupfile)
        lookup_url_persister = Persister(lookup_urlfile)
        since_persister = Persister(since_file)

        # Resume from the state of the previous run; an empty dict is the
        # default when nothing could be loaded.
        index = index_persister.load({})
        graph = graph_persister.load({})
        lookup_url = lookup_url_persister.load({})
        since = since_persister.load()

        # NOTE(review): url_lookup is never loaded here, unlike the other
        # structures - confirm that Indexer rebuilds it, otherwise earlier
        # entries may be lost when it is saved below.
        indexer = Indexer()
        indexer.index = index
        indexer.graph = graph
        indexer.lookup_url = lookup_url

        klogger.info("Indexing " + tweetfile)
        if since:
            klogger.info("Since " + str(since))

        include_count = 0
        exclude_count = 0
        # The with-block guarantees the file handle is released even when a
        # line cannot be parsed and the exception handler below takes over.
        with open(tweetfile, "r") as url_list:
            for timestamp_url in url_list:
                # NOTE(review): a line without exactly one tab raises
                # ValueError and aborts the whole run via the handler below.
                timestamp, url = timestamp_url.split("\t")
                url = url.strip()
                if url not in lookup_url and (not since or since <= timestamp):
                    input_queue.put_nowait(url)
                    # Remember the newest queued timestamp for the next run.
                    since = timestamp
                    include_count += 1
                else:
                    exclude_count += 1

        klogger.info("Including: " + str(include_count) + " Excluding: " + str(exclude_count))

        if include_count <= 0:
            klogger.info("Nothting to index")
            return

        # Spawn off crawler and indexer
        gevent.joinall([
            gevent.spawn(Crawler().gevent_crawl, input_queue, output_queue),
            gevent.spawn(indexer.gevent_index, input_queue, output_queue)
        ])

        if not indexer.done:
            return klogger.info("Indexing failed")

        index = indexer.index
        graph = indexer.graph
        url_lookup = indexer.url_lookup
        lookup_url = indexer.lookup_url

        index_persister.save(index)
        graph_persister.save(graph)
        url_lookup_persister.save(url_lookup)
        lookup_url_persister.save(lookup_url)
        since_persister.save(since)

        klogger.info("Saved index in " + indexfile + " (length " + str(len(index)) + ")")
        klogger.info("Saved graph in " + graphfile + " (length " + str(len(graph)) + ")")
        klogger.info("Saved lookup url in " + lookup_urlfile + " (length " + str(len(lookup_url)) + ")")
        klogger.info("Saved url lookup in " + url_lookupfile + " (length " + str(len(url_lookup)) + ")")

        klogger.info("Indexing completed")
    except KeyboardInterrupt:
        klogger.info('^C received, shutting down indexer')
    except Exception as e:
        # "as" form is valid on Python 2.6+ and Python 3, unlike "Exception, e".
        klogger.exception(e)
Example #3
0
 def setUp(self):
     """Create the persister used by the tests in this case."""
     scratch_path = "/tmp/persister_test"
     self.persister = Persister(scratch_path)
Example #4
0
 def test_default_return(self):
     """load(default) is expected to give back the default for a missing file."""
     fallback = {}
     missing = Persister("/tmp/this_file_does_not_exist")
     loaded = missing.load(fallback)
     self.assertEqual(fallback, loaded)
Example #5
0
 def test_attempt_to_load_nonexisting_file_should_return_none(self):
     """load() without a default is expected to give None for a missing file."""
     missing = Persister("/tmp/this_file_does_not_exist")
     result = missing.load()
     self.assertIsNone(result)