def page_info(self, page_hash):
    """Fetch the stored information for the page identified by `page_hash`.

    Calls `page_db_get_info` on the wrapped page database, passing
    `page_hash` cast to `uint64_t` and an out-parameter of type
    `PageInfo **`.

    Returns:
        A `PageInfo` built from `page_hash` and the pointer written into
        the out-parameter.

    Raises:
        PageDBException: built from the database error when the C call
            returns a non-zero status.
    """
    # Out-parameter that the C call fills with a pointer to the info struct
    info_out = ffi.new('PageInfo **')
    status = self._c_aduana.page_db_get_info(
        self._page_db[0],
        ffi.cast('uint64_t', page_hash),
        info_out,
    )
    if status == 0:
        return PageInfo(page_hash, info_out[0])
    raise PageDBException.from_error(self._page_db[0].error)
def __init__(self, url, links=None):
    """Create a crawled page for `url` and attach its outgoing links.

    Parameters:
        - url: a string
        - links: an iterable (or None, meaning no links) where each
                 element can be:
                     a. A link URL
                     b. A pair made from a link URL and a score between
                        0 and 1

                 If the first option is used then score is assumed 0.0

    Raises:
        PageDBException: if `crawled_page_new` returns NULL or if
            `crawled_page_add_link` returns a non-zero status.
    """
    # make sure we keep a reference to the module
    self._c_aduana = C_ADUANA

    self._crawled_page = self._c_aduana.crawled_page_new(url)
    if not self._crawled_page:
        raise PageDBException(
            "Error inside crawled_page_new: returned NULL")

    # The default is None instead of a shared mutable list literal
    if links is None:
        links = ()

    for link in links:
        # Distinct local names so the `url` parameter is not rebound
        if isinstance(link, basestring):
            link_url, score = link, 0.0
        else:
            link_url, score = link[0], link[1]

        ret = self._c_aduana.crawled_page_add_link(
            self._crawled_page,
            link_url,
            ffi.cast("float", score))
        if ret != 0:
            raise PageDBException(
                "Error inside crawled_page_add_link: returned %d" % ret)
def __init__(self, url, links=None):
    """Create a crawled page for `url` and attach its outgoing links.

    Parameters:
        - url: a string
        - links: an iterable (or None, meaning no links) where each
                 element can be:
                     a. A link URL
                     b. A pair made from a link URL and a score between
                        0 and 1

                 If the first option is used then score is assumed 0.0

    Raises:
        AduanaException: if `crawled_page_new` returns NULL or if
            `crawled_page_add_link` returns a non-zero status.
    """
    # make sure we keep a reference to the module
    self._c_aduana = C_ADUANA

    self._crawled_page = self._c_aduana.crawled_page_new(url)
    if not self._crawled_page:
        raise AduanaException(
            "Error inside crawled_page_new: returned NULL")

    # The default is None instead of a shared mutable list literal
    if links is None:
        links = ()

    for link in links:
        # Distinct local names so the `url` parameter is not rebound
        if isinstance(link, basestring):
            link_url, score = link, 0.0
        else:
            link_url, score = link[0], link[1]

        ret = self._c_aduana.crawled_page_add_link(
            self._crawled_page,
            link_url,
            ffi.cast("float", score))
        if ret != 0:
            raise AduanaException(
                "Error inside crawled_page_add_link: returned %d" % ret)
def page_info(self, page_hash):
    """Fetch the stored information for the page identified by `page_hash`.

    Calls `page_db_get_info` on the wrapped page database, passing
    `page_hash` cast to `uint64_t` and an out-parameter of type
    `PageInfo **`.

    Returns:
        A `PageInfo` built from `page_hash` and the pointer written into
        the out-parameter.

    Raises:
        AduanaException: built from the database error when the C call
            returns a non-zero status.
    """
    # Out-parameter that the C call fills with a pointer to the info struct
    info_out = ffi.new('PageInfo **')
    status = self._c_aduana.page_db_get_info(
        self._page_db[0],
        ffi.cast('uint64_t', page_hash),
        info_out,
    )
    if status == 0:
        return PageInfo(page_hash, info_out[0])
    raise AduanaException.from_error(self._page_db[0].error)
def load(self, freq_iter):
    """Write every (page_hash, page_freq) pair from `freq_iter` to the scheduler.

    Opens a scheduler cursor, writes each pair through it (the hash is
    cast to `uint64_t`), and finally commits the cursor.

    Parameters:
        - freq_iter: an iterable yielding (page_hash, page_freq) pairs

    Raises:
        PageDBException: built from the scheduler error when opening or
            committing the cursor returns a non-zero status.
    """
    lib = self._c_aduana

    # Out-parameter receiving the opaque cursor handle
    cursor_out = ffi.new('void **')
    if lib.freq_scheduler_cursor_open(self._sch[0], cursor_out) != 0:
        raise PageDBException.from_error(self._sch[0].error)

    for entry in freq_iter:
        page_hash, page_freq = entry
        # NOTE(review): the result of the write call is not inspected here
        lib.freq_scheduler_cursor_write(
            self._sch[0],
            cursor_out[0],
            ffi.cast('uint64_t', page_hash),
            page_freq,
        )

    if lib.freq_scheduler_cursor_commit(self._sch[0], cursor_out[0]) != 0:
        raise PageDBException.from_error(self._sch[0].error)
def load(self, freq_iter):
    """Write every (page_hash, page_freq) pair from `freq_iter` to the scheduler.

    Opens a scheduler cursor, writes each pair through it (the hash is
    cast to `uint64_t`), and finally commits the cursor.

    Parameters:
        - freq_iter: an iterable yielding (page_hash, page_freq) pairs

    Raises:
        AduanaException: built from the scheduler error when opening or
            committing the cursor returns a non-zero status.
    """
    lib = self._c_aduana

    # Out-parameter receiving the opaque cursor handle
    cursor_out = ffi.new('void **')
    if lib.freq_scheduler_cursor_open(self._sch[0], cursor_out) != 0:
        raise AduanaException.from_error(self._sch[0].error)

    for entry in freq_iter:
        page_hash, page_freq = entry
        # NOTE(review): the result of the write call is not inspected here
        lib.freq_scheduler_cursor_write(
            self._sch[0],
            cursor_out[0],
            ffi.cast('uint64_t', page_hash),
            page_freq,
        )

    if lib.freq_scheduler_cursor_commit(self._sch[0], cursor_out[0]) != 0:
        raise AduanaException.from_error(self._sch[0].error)
def hash(self, value):
    """Set the 64-bit hash of the wrapped crawled page to `value`.

    `value` is cast to `uint64_t` and handed to
    `crawled_page_set_hash64`.

    Raises:
        PageDBException: if the C call returns a non-zero status.
    """
    hash64 = ffi.cast('uint64_t', value)
    status = self._c_aduana.crawled_page_set_hash64(
        self._crawled_page, hash64)
    if status == 0:
        return
    raise PageDBException(
        "Error inside crawled_page_set_hash64: returned %d" % status)
def hash(self):
    """Return the page's content hash read as a `uint64_t`, or None.

    The `content_hash` field of the wrapped crawled page is cast to a
    `uint64_t *`; when that pointer is falsy (NULL) there is no hash
    and None is returned, otherwise the pointed-to value is returned.
    """
    hash_ptr = ffi.cast('uint64_t *', self._crawled_page.content_hash)
    return hash_ptr[0] if hash_ptr else None
def hash(self, value):
    """Set the 64-bit hash of the wrapped crawled page to `value`.

    `value` is cast to `uint64_t` and handed to
    `crawled_page_set_hash64`.

    Raises:
        AduanaException: if the C call returns a non-zero status.
    """
    hash64 = ffi.cast('uint64_t', value)
    status = self._c_aduana.crawled_page_set_hash64(
        self._crawled_page, hash64)
    if status == 0:
        return
    raise AduanaException(
        "Error inside crawled_page_set_hash64: returned %d" % status)