Example #1
    def test_add_page_to_index_with_too_long_word(self):
        indexer = Indexer()
        index = {}
        url = "http://dn.se"
        content = "a brown porcupinesporcupinesporcupinesporcupinesporcupinesporcupines"
        indexer.add_page_to_index(index, url, content)
        self.assertEqual({u'a': [[0, 'http://dn.se']], u'brown': [[1, 'http://dn.se']]}, index)
Example #2
    def test_add_page_to_index(self):
        indexer = Indexer()
        index = {}
        url = "http://dn.se"
        content = "a brown fox"
        indexer.add_page_to_index(index, url, content)
        self.assertEqual({u'a': [[0, 'http://dn.se']], u'brown': [[1, 'http://dn.se']], u'fox': [[2, 'http://dn.se']]}, index)
Example #3
    def test_add_to_index(self):
        indexer = Indexer()
        index = {}
        keyword = "keyword"
        url = "http://dn.se"

        indexer.add_to_index(index, keyword, 42, url)
        self.assertEqual({'keyword': [[42, 'http://dn.se']]}, index)

        indexer.add_to_index(index, keyword, 43, url)
        self.assertEqual({'keyword': [[42, 'http://dn.se'], [43, 'http://dn.se']]}, index)
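Taken together, Examples #1-#3 pin down the index shape: each keyword maps to a list of [position, url] entries, and over-long words are dropped before indexing. A minimal Indexer sketch that would satisfy these tests might look like the following; the MAX_WORD_LENGTH cutoff and the method bodies are assumptions for illustration, not the project's actual implementation.

    class Indexer(object):
        # Assumed cutoff: Example #1 only shows that a 60-character word is skipped,
        # so the exact limit used by the real Indexer is a guess.
        MAX_WORD_LENGTH = 50

        def add_to_index(self, index, keyword, position, url):
            # Append a [position, url] entry to the keyword's posting list.
            if keyword in index:
                index[keyword].append([position, url])
            else:
                index[keyword] = [[position, url]]

        def add_page_to_index(self, index, url, content):
            # Index each word of the content at its word position,
            # skipping words that are too long to be useful search terms.
            for position, word in enumerate(content.split()):
                if len(word) <= self.MAX_WORD_LENGTH:
                    self.add_to_index(index, word, position, url)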
Example #4
    def test_indexing(self):
        # given
        result = {Crawler.CONTENT: '<html><head><title>some title</title><meta name="description" content="some description"/></head><body><p>some text</p></body></html>',
                  Crawler.EXPANDED_URL: "http://expandedurl.com",
                  Crawler.URL: "http://url.com"}

        # when
        indexer = Indexer()
        indexer.indexing(result)

        # then
        expected_index = {u'description': [[3, 0]],
                          u'some': [[0, 0], [2, 0], [4, 0]],
                          u'text': [[5, 0]],
                          u'title': [[1, 0]]}

        self.assertEqual(expected_index, indexer.index)
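Example #4 hints at how indexing() ties these pieces together: the crawled page's visible text (title, meta description, body) is flattened into words and indexed under a numeric url id (0 for the first url seen) rather than the url string itself. A rough sketch of that behaviour, written as a method on the Indexer sketched above (with self.index, self.lookup_url and self.url_lookup assumed to start as empty dicts), using crude regex extraction rather than whatever HTML parsing the project actually does, and ignoring the graph since this test only checks the index:

    import re

    def indexing(self, result):
        # Assumed sketch: assign the url a numeric id, pull the visible text
        # out of the HTML, and index it with add_page_to_index.
        url = result[Crawler.EXPANDED_URL]  # assumption: EXPANDED_URL rather than URL
        if url not in self.lookup_url:
            url_id = len(self.lookup_url)
            self.lookup_url[url] = url_id
            self.url_lookup[url_id] = url
        url_id = self.lookup_url[url]

        html = result[Crawler.CONTENT]
        title = " ".join(re.findall(r"<title>(.*?)</title>", html))
        description = " ".join(re.findall(r'content="(.*?)"', html))
        body_match = re.search(r"<body>(.*?)</body>", html, re.DOTALL)
        body_text = re.sub(r"<[^>]+>", " ", body_match.group(1)) if body_match else ""

        self.add_page_to_index(self.index, url_id, " ".join([title, description, body_text]))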
Example #5
def main():
    """
    tweet_indexer consumes the output (tweetfile) created by tweet_scanner
    and creates:
    * indexfile: searchable dictionary - {word: [[position, url_id], ...]}
    * graphfile: each url and its outbound links - {url: [list of urls]}
    * url_lookupfile: dictionary containing url ids - {url_id: url}
    """
    try:
        klogger.logger = klogger.get_logger("ketchlip", "indexer.log")

        input_queue = Queue()
        output_queue = Queue()

        base_dir = config.config.base_dir

        tweetfile = base_dir + "tweets.txt" # timestamp \t url
        indexfile = base_dir + "index"
        graphfile = base_dir + "graph"
        url_lookupfile = base_dir + "url_lookup"
        lookup_urlfile = base_dir + "lookup_url"
        since_file = base_dir + "since"

        index_persister = Persister(indexfile)
        graph_persister = Persister(graphfile)
        url_lookup_persister = Persister(url_lookupfile)
        lookup_url_persister = Persister(lookup_urlfile)
        since_persister = Persister(since_file)

        index = index_persister.load({})
        graph = graph_persister.load({})
        lookup_url = lookup_url_persister.load({})
        since = since_persister.load()

        indexer = Indexer()
        indexer.index = index
        indexer.graph = graph
        indexer.lookup_url = lookup_url

        klogger.info("Indexing " + tweetfile)
        if since:
            klogger.info("Since " + str(since))

        url_list = open(tweetfile, "r")
        include_count = 0
        exclude_count = 0
        for timestamp_url in url_list:
            timestamp, url = timestamp_url.split("\t")
            url = url.strip()
            if url not in lookup_url and (not since or since <= timestamp):
                input_queue.put_nowait(url)
                since = timestamp
                include_count += 1
            else:
                exclude_count += 1

        klogger.info("Including: " + str(include_count) + " Excluding: " + str(exclude_count))

        if include_count <= 0:
            klogger.info("Nothting to index")
            return

        # Spawn off crawler and indexer
        gevent.joinall([
            gevent.spawn(Crawler().gevent_crawl, input_queue, output_queue),
            gevent.spawn(indexer.gevent_index, input_queue, output_queue)
        ])

        if not indexer.done:
            return klogger.info("Indexing failed")

        index = indexer.index
        graph = indexer.graph
        url_lookup = indexer.url_lookup
        lookup_url = indexer.lookup_url

        index_persister.save(index)
        graph_persister.save(graph)
        url_lookup_persister.save(url_lookup)
        lookup_url_persister.save(lookup_url)
        since_persister.save(since)

        klogger.info("Saved index in " + indexfile + " (length " + str(len(index)) + ")")
        klogger.info("Saved graph in " + graphfile + " (length " + str(len(graph)) + ")")
        klogger.info("Saved lookup url in " + lookup_urlfile + " (length " + str(len(lookup_url)) + ")")
        klogger.info("Saved url lookup in " + url_lookupfile + " (length " + str(len(url_lookup)) + ")")

        klogger.info("Indexing completed")
    except KeyboardInterrupt:
        klogger.info('^C received, shutting down indexer')
    except Exception as e:
        klogger.exception(e)
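As a rough illustration of how the files written by main() fit together, a consumer could resolve an indexed word back to tweet urls along these lines (hypothetical lookup code, not part of the module; it assumes the same config.config.base_dir and the Persister.load(default) behaviour used above):

    index_persister = Persister(base_dir + "index")
    url_lookup_persister = Persister(base_dir + "url_lookup")

    index = index_persister.load({})            # {word: [[position, url_id], ...]}
    url_lookup = url_lookup_persister.load({})  # {url_id: url}

    # Print every url (and word position) where "brown" appears.
    for position, url_id in index.get("brown", []):
        print url_lookup[url_id], position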