Exemple #1
0
    def __init__(self, url, links=None):
        """Create a crawled page for ``url`` and attach its links.

        Parameters:
            - url: a string
            - links: an optional iterable where each element can be:
                         a. A link URL
                         b. A pair made from a link URL and a score between 0 and 1
                     If the first option is used then score is assumed 0.0.
                     When omitted (or None) no links are added.

        Raises:
            PageDBException: if ``crawled_page_new`` returns a false (NULL)
                value, or ``crawled_page_add_link`` returns a non-zero code.
        """
        # make sure we keep a reference to the module
        self._c_aduana = C_ADUANA

        self._crawled_page = self._c_aduana.crawled_page_new(url)
        if not self._crawled_page:
            raise PageDBException(
                "Error inside crawled_page_new: returned NULL")

        # ``None`` stands in for "no links"; this avoids a mutable list
        # object shared between calls as the default value.
        if links is None:
            links = ()

        for link in links:
            # Use dedicated names so the ``url`` parameter is not clobbered.
            if isinstance(link, basestring):
                link_url = link
                score = 0.0
            else:
                link_url = link[0]
                score = link[1]

            ret = self._c_aduana.crawled_page_add_link(
                self._crawled_page,
                link_url,
                ffi.cast("float", score))
            if ret != 0:
                raise PageDBException(
                    "Error inside crawled_page_add_link: returned %d" % ret)
Exemple #2
0
 def page_info(self, page_hash):
     """Fetch the info record stored for ``page_hash``.

     Calls ``page_db_get_info`` with ``page_hash`` cast to ``uint64_t`` and
     an out-pointer for the result.

     Returns:
         ``PageInfo(page_hash, <pointer written by the call>)``.

     Raises:
         PageDBException: built from the page database's ``error`` field
             when the call returns a non-zero code.
     """
     info_out = ffi.new('PageInfo **')  # out-parameter for the C call
     page_db = self._page_db[0]
     hash64 = ffi.cast('uint64_t', page_hash)

     status = self._c_aduana.page_db_get_info(page_db, hash64, info_out)
     if status == 0:
         return PageInfo(page_hash, info_out[0])
     raise PageDBException.from_error(page_db.error)
Exemple #3
0
    def load(self, freq_iter):
        """Write every ``(page_hash, page_freq)`` pair into the scheduler.

        Opens a cursor with ``freq_scheduler_cursor_open``, issues one
        ``freq_scheduler_cursor_write`` per pair yielded by ``freq_iter``
        and finishes with ``freq_scheduler_cursor_commit``.

        Raises:
            PageDBException: built from the scheduler's ``error`` field when
                opening or committing the cursor returns a non-zero code.
        """
        cursor_ref = ffi.new('void **')  # out-parameter filled by cursor_open
        lib = self._c_aduana
        scheduler = self._sch[0]

        if lib.freq_scheduler_cursor_open(scheduler, cursor_ref) != 0:
            raise PageDBException.from_error(scheduler.error)

        for page_hash, page_freq in freq_iter:
            hash64 = ffi.cast('uint64_t', page_hash)
            # NOTE(review): the result of the write call is discarded here;
            # confirm whether it can report an error that should be raised.
            lib.freq_scheduler_cursor_write(
                scheduler, cursor_ref[0], hash64, page_freq)

        status = lib.freq_scheduler_cursor_commit(scheduler, cursor_ref[0])
        if status != 0:
            raise PageDBException.from_error(scheduler.error)
Exemple #4
0
 def hash(self, value):
     """Set the page's 64-bit hash to ``value``.

     ``value`` is cast to ``uint64_t`` and handed to
     ``crawled_page_set_hash64`` together with the wrapped page.

     Raises:
         PageDBException: if the call returns a non-zero code.
     """
     set_hash64 = self._c_aduana.crawled_page_set_hash64
     status = set_hash64(self._crawled_page, ffi.cast('uint64_t', value))
     if status == 0:
         return
     raise PageDBException(
         "Error inside crawled_page_set_hash64: returned %d" % status)
Exemple #5
0
 def hash(self):
     """Return the page's content hash, or None when there is none.

     The page's ``content_hash`` field is reinterpreted as a
     ``uint64_t *``; if that pointer is truthy its first element is
     returned, otherwise None.
     """
     hash_ptr = ffi.cast('uint64_t *', self._crawled_page.content_hash)
     return hash_ptr[0] if hash_ptr else None