def test_callback(self):
    counter = [0]

    first_key = 'a'
    first_value = 1

    def callback(key, value):
        self.assertEqual(key, first_key)
        self.assertEqual(value, first_value)
        counter[0] += 1

    l = LRU(1, callback=callback)
    l[first_key] = first_value
    l['b'] = 1              # test calling the callback
    self.assertEqual(counter[0], 1)
    self.assertEqual(l.keys(), ['b'])

    l['b'] = 2              # doesn't call callback
    self.assertEqual(counter[0], 1)
    self.assertEqual(l.keys(), ['b'])
    self.assertEqual(l.values(), [2])

    l = LRU(1, callback=callback)
    l[first_key] = first_value
    l.set_callback(None)
    l['c'] = 1              # doesn't call callback
    self.assertEqual(counter[0], 1)
    self.assertEqual(l.keys(), ['c'])

    l.set_callback(callback)
    del l['c']              # doesn't call callback
    self.assertEqual(l.keys(), [])
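# The test above pins down lru-dict's callback contract: the callback fires
# only when an insertion evicts an old item, never on an overwrite of an
# existing key or an explicit `del`. A minimal standalone sketch:
from lru import LRU

def evicted(key, value):
    print("evicted:", key, value)

cache = LRU(2, callback=evicted)
cache['a'] = 1
cache['b'] = 2
cache['c'] = 3     # over capacity: prints "evicted: a 1"
cache['c'] = 4     # overwrite: no callback
del cache['b']     # explicit delete: no callback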
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        self.l1 = LRU(c_hash)
        self.l2 = LRU(c_user)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if self.l1.has_key(h):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if self.l2.has_key(u):
                u_ind += u1 - self.l2.keys().index(u)
        if h_match != 0:
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = (h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1])) \
            + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1])) + c
        return similarity
def test_lru(self):
    l = LRU(1)
    l['a'] = 1
    l['a']
    self.assertEqual(l.keys(), ['a'])
    l['b'] = 2
    self.assertEqual(l.keys(), ['b'])

    l = LRU(2)
    l['a'] = 1
    l['b'] = 2
    self.assertEqual(len(l), 2)
    l['a']              # Testing the first one
    l['c'] = 3
    self.assertEqual(sorted(l.keys()), ['a', 'c'])
    l['c']
    self.assertEqual(sorted(l.keys()), ['a', 'c'])

    l = LRU(3)
    l['a'] = 1
    l['b'] = 2
    l['c'] = 3
    self.assertEqual(len(l), 3)
    l['b']              # Testing the middle one
    l['d'] = 4
    self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
    l['d']              # Testing the last one
    self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
    l['e'] = 5
    self.assertEqual(sorted(l.keys()), ['b', 'd', 'e'])
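# Sketch of the ordering rule this test exercises: lru-dict keeps keys()
# most-recently-used first, and a plain read refreshes recency, which is
# why the bare l['a'] / l['b'] accesses above protect those keys from
# eviction:
from lru import LRU

l = LRU(3)
l['a'] = 1
l['b'] = 2
l['c'] = 3
print(l.keys())    # ['c', 'b', 'a'] -- MRU first
l['a']             # read refreshes 'a'
l['d'] = 4         # evicts 'b', now the least recently used
print(l.keys())    # ['d', 'a', 'c']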
def test_bench_with_original(benchmark, data, collector):
    m = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    c1 = LRU(2000)
    benchmark.pedantic(run_cache, args=(c1, data), iterations=1, rounds=100)
    hits, misses = c1.get_stats()
    items = len(c1.keys())
    del c1
    gc.collect()
    collector(
        dict(hits=hits,
             misses=misses,
             items=items,
             memory=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - m))
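# The benchmark relies on pytest-benchmark's pedantic() API plus `data`,
# `collector`, and a run_cache helper defined elsewhere in the suite. A
# hypothetical run_cache, only to make the measured loop concrete
# (lru-dict tallies get()/[] lookups in get_stats()):
def run_cache(cache, data):
    for key in data:
        if cache.get(key) is None:  # counted as a miss in get_stats()
            cache[key] = key        # populate so repeated keys hit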
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size
        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            procs_per_host = self.circle.size / hostcnt
            self._read_cache_limit = ((max_ofile - 64) / procs_per_host) / 3
            self._write_cache_limit = ((max_ofile - 64) / procs_per_host) * 2 / 3

        if self._read_cache_limit <= 0 or self._write_cache_limit <= 0:
            self._read_cache_limit = 1
            self._write_cache_limit = 8

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0
        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0        # this is the cnt for the enqueued items
        self.reduce_items = 0   # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        for f in self.rfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        for f in self.wfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # we need to do this because if last job didn't finish cleanly
        # the fwalk files can be found as leftovers
        # and if fcp cleanup has a chance, it should clean up that
        """
        fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)
        """

    def new_fchunk(self, fitem):
        fchunk = FileChunk()    # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and enqueueing all chunks.
        """
        chunks = fi.st_size / self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:     # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" %
                  (fi.path, fi.st_size, workcnt), extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)   # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage.
        """
        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # we batch operation hard-coded
        log.info("create() starts, flist length = %s" %
                 len(self.treewalk.flist), extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # both memory and database checkpoint
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        # G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        # print("Total chunks: ", G.total_chunks)

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @return: file descriptor
        """
        if d.has_key(k):
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit, clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError as e:
                log.warn("FD for %s not valid when closing" % old_k, extra=self.d)

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:   # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)     # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(
                    e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        src = work.src
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT,
                           self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY,
                                       self._write_cache_limit)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                      (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)
        print("\nMake checkpoint files: ", self.checkpoint_file)

    def do_checkpoint(self):
        # when making a checkpoint, first write workq and workq_buf into the
        # checkpoint file, then make a copy of workq_db if it exists
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)

        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, make a copy of the current workq db file,
                # which is the provided checkpoint db file
                self.workdir = os.getcwd()
                existingCheckpoint = os.path.join(
                    self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existingCheckpoint, self.checkpoint_db)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        else:
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / \
                self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.
        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())
            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if self.use_store == False:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
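# Aside: both fd caches above evict by hand (d.items()[-1], then os.close).
# lru-dict's eviction callback can fold that bookkeeping into the cache
# itself; a sketch with an illustrative capacity, not pcircle code:
import os
from lru import LRU

def close_evicted(path, fd):
    try:
        os.close(fd)
    except OSError:
        pass

fd_cache = LRU(128, callback=close_evicted)

def open_cached(path, flags):
    if path in fd_cache:
        return fd_cache[path]
    fd = os.open(path, flags)
    fd_cache[path] = fd   # may evict and auto-close the oldest descriptor
    return fd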
def test_empty(self):
    l = LRU(1)
    self.assertEqual([], l.keys())
    self.assertEqual([], l.values())
class Cache:
    """Class representing D3N."""

    # Replacement policies
    LRU = "LRU"
    LFU = "LFU"
    LRU_S = "LRU_S"
    FIFO = "FIFO"
    RAND = "RAND"

    # Write policies
    WRITE_BACK = "WB"
    WRITE_THROUGH = "WT"

    # Layer
    L1 = "L1"
    L2 = "L2"

    consistent = "consistent"
    rendezvous = "rendezvous"
    rr = "rr"

    def __init__(self, layer, size, replace_pol, write_pol, hash_ring,
                 hash_type, obj_size, full_size, logger):
        self._replace_pol = replace_pol     # Replacement policy
        self._write_pol = write_pol         # Write policy
        self._layer = layer                 # Layer info
        self._size = size                   # Cache size
        self.spaceLeft = size               # Cache size
        self._logger = logger
        self.hashmap = {}                   # Mapping
        self.hash_ring = hash_ring
        self._hash_type = hash_type
        self._obj_size = obj_size

        if self._size == 0:
            self.zerosize = True
            self._size = 1
        else:
            self.zerosize = False

        if self._replace_pol == Cache.LRU:
            self.cache = LRU(self._size)
        elif self._replace_pol == Cache.FIFO:
            self.cache = deque()
        elif self._replace_pol == Cache.LRU_S:
            self.cache = LRU(self._size)
            self.shadow = LRU(full_size)
            self.hist = []
            for i in range(full_size):
                self.hist.append(0)

        # Statistics
        self._hit_count = 0
        self._miss_count = 0
        self._backend_bw = 0
        self._crossrack_bw = 0
        self._intrarack_bw = 0
        self.miss_lat = 0
        self.lat_count = 0

    def _insert1(self, key, size):
        # No eviction
        if not self.zerosize:
            if self._replace_pol == Cache.LRU_S:
                self.shadow[key] = 1

            if int(size) <= self.spaceLeft:
                if self._replace_pol == Cache.LRU:
                    self.cache[key] = int(size)
                elif self._replace_pol == Cache.LRU_S:
                    self.cache[key] = int(size)
                elif self._replace_pol == Cache.FIFO:
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)
            else:
                while int(size) > self.spaceLeft:
                    self._evict()
                if self._replace_pol == Cache.LRU:
                    self.cache[key] = int(size)
                elif self._replace_pol == Cache.LRU_S:
                    self.cache[key] = int(size)
                elif self._replace_pol == Cache.FIFO:
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)

    def _insert(self, key, size):
        # No eviction
        if not self.zerosize:
            if self._replace_pol == Cache.LRU_S:
                self.cache[key] = int(size)
                self.shadow[key] = int(size)
            elif self._replace_pol == Cache.LRU:
                self.cache[key] = int(size)
            else:
                if int(size) <= self.spaceLeft:
                    if self._replace_pol == Cache.LRU:
                        self.cache[key] = int(size)
                    elif self._replace_pol == Cache.LRU_S:
                        self.cache[key] = int(size)
                    elif self._replace_pol == Cache.FIFO:
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)
                else:
                    while int(size) > self.spaceLeft:
                        self._evict()
                    if self._replace_pol == Cache.LRU:
                        self.cache[key] = int(size)
                    elif self._replace_pol == Cache.LRU_S:
                        self.cache[key] = int(size)
                    elif self._replace_pol == Cache.FIFO:
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)

    def read1(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return None

        r = None
        if self._replace_pol == Cache.LRU_S:
            if self.shadow.has_key(key):
                count = 0
                for i in self.shadow.keys():
                    if i == key:
                        self.hist[count] += 1
                        break
                    count += 1
            self.shadow[key] = 1

        if key in self.hashmap:
            if self._replace_pol == Cache.LRU:
                self._update_use(key)
            elif self._replace_pol == Cache.LRU_S:
                self._update_use(key)
            self._hit_count += 1
            r = 1
        else:
            self._miss_count += 1
        return r

    def read(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return None

        r = None
        if self._replace_pol == Cache.LRU_S:
            if self.cache.has_key(key):
                self._hit_count += 1
                self.cache[key] = self.cache[key]   # refresh recency
                r = 1
            else:
                self._miss_count += 1
                if self.shadow.has_key(key):
                    count = 0
                    for i in self.shadow.keys():
                        if i == key:
                            self.hist[count] += 1
                            break
                        count += 1
                self.shadow[key] = 1
        else:
            if key in self.hashmap:
                if self._replace_pol == Cache.LRU:
                    self._update_use(key)
                elif self._replace_pol == Cache.LRU_S:
                    self._update_use(key)
                self._hit_count += 1
                r = 1
            else:
                self._miss_count += 1
        return r

    def checkKey(self, key):
        """Check whether a key is present in the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return 0

        r = 0
        if (self._replace_pol == Cache.LRU_S) or (self._replace_pol == Cache.LRU):
            if self.cache.has_key(key):
                r = 1
            else:
                r = 0
        return r

    def _evict(self):
        if self._replace_pol == Cache.LRU:
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif self._replace_pol == Cache.LRU_S:
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif self._replace_pol == Cache.FIFO:
            id = self.cache.popleft()
        self.spaceLeft += int(self.hashmap[id])
        del self.hashmap[id]

    def _update_use(self, key):
        """Update the use of a cache."""
        if self._replace_pol == Cache.LRU:
            self.cache[key] = self.hashmap[key]
        if self._replace_pol == Cache.LRU_S:
            self.cache[key] = self.hashmap[key]

    def set_cache_size(self, size):
        new_size = self.cache.get_size() + int(size)
        self.cache.set_size(int(new_size))

    def set_backend_bw(self, value):
        self._backend_bw += value

    def set_crossrack_bw(self, value):
        self._crossrack_bw += value

    def set_intrarack_bw(self, value):
        self._intrarack_bw += value

    def get_backend_bw(self):
        return self._backend_bw

    def get_crossrack_bw(self):
        return self._crossrack_bw

    def get_intrarack_bw(self):
        return self._intrarack_bw

    def get_replace_pol(self):
        return self._replace_pol

    def get_hit_count(self):
        return self._hit_count

    def get_miss_count(self):
        return self._miss_count

    def get_available_space(self):
        return self.spaceLeft

    def get_replace_poll(self):
        return self._replace_pol

    def reset_shadow_cache(self):
        self.shadow.clear()

    def print_cache(self):
        print(self.cache)

    def get_l2_address(self, key):
        if self._hash_type == Cache.consistent:
            return self.hash_ring.get_node(key)
        elif self._hash_type == Cache.rendezvous:
            return self.hash_ring.find_node(key)
        elif self._hash_type == Cache.rr:
            val = key.split("_")[1]
            res = int(val) % int(self.hash_ring)
            return res
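# Hypothetical driver for the Cache above (argument values are guesses, not
# the project's defaults). _insert1 maintains `hashmap`, which read()
# consults under the plain-LRU policy:
cache = Cache(layer=Cache.L1, size=1024, replace_pol=Cache.LRU,
              write_pol=Cache.WRITE_THROUGH, hash_ring=None,
              hash_type=Cache.consistent, obj_size=4, full_size=2048,
              logger=None)
cache._insert1("obj_1", 4)
assert cache.read("obj_1", 4) == 1      # hit
assert cache.read("obj_2", 4) is None   # miss
print(cache.get_hit_count(), cache.get_miss_count())   # 1 1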
class AccountDB(AccountDatabaseAPI):
    logger = get_extended_debug_logger('eth.db.account.AccountDB')

    def __init__(self, db: AtomicDatabaseAPI, state_root: Hash32 = BLANK_ROOT_HASH) -> None:
        r"""
        Internal implementation details (subject to rapid change):
        Database entries go through several pipes, like so...

        .. code::

            db > _batchdb ---------------------------> _journaldb ----------------> code lookups
             \
              -> _batchtrie -> _trie -> _trie_cache -> _journaltrie --------------> account lookups

        Journaling sequesters writes at the _journal* attrs ^, until persist is called.

        _batchtrie enables us to prune all trie changes while building
        state, without deleting old trie roots.

        _batchdb and _batchtrie together enable us to make the state root,
        without saving everything to the database.

        _journaldb is a journaling of the keys and values used to store
        code and account storage.

        _trie is a hash-trie, used to generate the state root

        _trie_cache is a cache tied to the state root of the trie. It
        is important that this cache is checked *after* looking for
        the key in _journaltrie, because the cache is only invalidated
        after a state root change.

        _journaltrie is a journaling of the accounts (an address->rlp mapping,
        rather than the nodes stored by the trie). This enables
        a squashing of all account changes before pushing them into the trie.

        .. NOTE:: StorageDB works similarly

        AccountDB synchronizes the snapshot/revert/persist of both of the
        journals.
        """
        self._raw_store_db = KeyAccessLoggerAtomicDB(db, log_missing_keys=False)
        self._batchdb = BatchDB(self._raw_store_db)
        self._batchtrie = BatchDB(self._raw_store_db, read_through_deletes=True)
        self._journaldb = JournalDB(self._batchdb)
        self._trie = HashTrie(HexaryTrie(self._batchtrie, state_root, prune=True))
        self._trie_logger = KeyAccessLoggerDB(self._trie, log_missing_keys=False)
        self._trie_cache = CacheDB(self._trie_logger)
        self._journaltrie = JournalDB(self._trie_cache)
        self._account_cache = LRU(2048)
        self._account_stores: Dict[Address, AccountStorageDatabaseAPI] = {}
        self._dirty_accounts: Set[Address] = set()
        self._root_hash_at_last_persist = state_root
        self._accessed_accounts: Set[Address] = set()
        self._accessed_bytecodes: Set[Address] = set()

    @property
    def state_root(self) -> Hash32:
        return self._trie.root_hash

    @state_root.setter
    def state_root(self, value: Hash32) -> None:
        if self._trie.root_hash != value:
            self._trie_cache.reset_cache()
            self._trie.root_hash = value

    def has_root(self, state_root: bytes) -> bool:
        return state_root in self._batchtrie

    #
    # Storage
    #
    def get_storage(self, address: Address, slot: int, from_journal: bool = True) -> int:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(slot, title="Storage Slot")

        account_store = self._get_address_store(address)
        return account_store.get(slot, from_journal)

    def set_storage(self, address: Address, slot: int, value: int) -> None:
        validate_uint256(value, title="Storage Value")
        validate_uint256(slot, title="Storage Slot")
        validate_canonical_address(address, title="Storage Address")

        account_store = self._get_address_store(address)
        self._dirty_accounts.add(address)
        account_store.set(slot, value)

    def delete_storage(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")
        self._set_storage_root(address, BLANK_ROOT_HASH)
        self._wipe_storage(address)

    def _wipe_storage(self, address: Address) -> None:
        """
        Wipe out the storage, without explicitly handling the storage root update
        """
        account_store = self._get_address_store(address)
        self._dirty_accounts.add(address)
        account_store.delete()

    def _get_address_store(self, address: Address) -> AccountStorageDatabaseAPI:
        if address in self._account_stores:
            store = self._account_stores[address]
        else:
            storage_root = self._get_storage_root(address)
            store = AccountStorageDB(self._raw_store_db, storage_root, address)
            self._account_stores[address] = store
        return store

    def _dirty_account_stores(self) -> Iterable[Tuple[Address, AccountStorageDatabaseAPI]]:
        for address in self._dirty_accounts:
            store = self._account_stores[address]
            yield address, store

    @to_tuple
    def _get_changed_roots(self) -> Iterable[Tuple[Address, Hash32]]:
        # list all the accounts that were changed, and their new storage roots
        for address, store in self._dirty_account_stores():
            if store.has_changed_root:
                yield address, store.get_changed_root()

    def _get_storage_root(self, address: Address) -> Hash32:
        account = self._get_account(address)
        return account.storage_root

    def _set_storage_root(self, address: Address, new_storage_root: Hash32) -> None:
        account = self._get_account(address)
        self._set_account(address, account.copy(storage_root=new_storage_root))

    def _validate_flushed_storage(self, address: Address,
                                  store: AccountStorageDatabaseAPI) -> None:
        if store.has_changed_root:
            actual_storage_root = self._get_storage_root(address)
            expected_storage_root = store.get_changed_root()
            if expected_storage_root != actual_storage_root:
                raise ValidationError(
                    "Storage root was not saved to account before trying to persist roots. "
                    f"Account {address!r} had storage {actual_storage_root!r}, "
                    f"but should be {expected_storage_root!r}.")

    #
    # Balance
    #
    def get_balance(self, address: Address) -> int:
        validate_canonical_address(address, title="Storage Address")
        account = self._get_account(address)
        return account.balance

    def set_balance(self, address: Address, balance: int) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(balance, title="Account Balance")
        account = self._get_account(address)
        self._set_account(address, account.copy(balance=balance))

    #
    # Nonce
    #
    def get_nonce(self, address: Address) -> int:
        validate_canonical_address(address, title="Storage Address")
        account = self._get_account(address)
        return account.nonce

    def set_nonce(self, address: Address, nonce: int) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(nonce, title="Nonce")
        account = self._get_account(address)
        self._set_account(address, account.copy(nonce=nonce))

    def increment_nonce(self, address: Address) -> None:
        current_nonce = self.get_nonce(address)
        self.set_nonce(address, current_nonce + 1)

    #
    # Code
    #
    def get_code(self, address: Address) -> bytes:
        validate_canonical_address(address, title="Storage Address")
        code_hash = self.get_code_hash(address)
        if code_hash == EMPTY_SHA3:
            return b''
        else:
            try:
                return self._journaldb[code_hash]
            except KeyError:
                raise MissingBytecode(code_hash)  # from KeyError
            finally:
                if code_hash in self._get_accessed_node_hashes():
                    self._accessed_bytecodes.add(address)

    def set_code(self, address: Address, code: bytes) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_is_bytes(code, title="Code")

        account = self._get_account(address)
        code_hash = keccak(code)
        self._journaldb[code_hash] = code
        self._set_account(address, account.copy(code_hash=code_hash))

    def get_code_hash(self, address: Address) -> Hash32:
        validate_canonical_address(address, title="Storage Address")
        account = self._get_account(address)
        return account.code_hash

    def delete_code(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")
        account = self._get_account(address)
        self._set_account(address, account.copy(code_hash=EMPTY_SHA3))

    #
    # Account Methods
    #
    def account_has_code_or_nonce(self, address: Address) -> bool:
        return self.get_nonce(address) != 0 or self.get_code_hash(address) != EMPTY_SHA3

    def delete_account(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")

        # We must wipe the storage first, because if it's the first time we load it,
        # then we want to load it with the original storage root hash, not the
        # empty one. (in case of a later revert, we don't want to poison the storage cache)
        self._wipe_storage(address)

        if address in self._account_cache:
            del self._account_cache[address]
        del self._journaltrie[address]

    def account_exists(self, address: Address) -> bool:
        validate_canonical_address(address, title="Storage Address")
        account_rlp = self._get_encoded_account(address, from_journal=True)
        return account_rlp != b''

    def touch_account(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")
        account = self._get_account(address)
        self._set_account(address, account)

    def account_is_empty(self, address: Address) -> bool:
        return not self.account_has_code_or_nonce(address) and self.get_balance(address) == 0

    #
    # Internal
    #
    def _get_encoded_account(self, address: Address, from_journal: bool = True) -> bytes:
        self._accessed_accounts.add(address)
        lookup_trie = self._journaltrie if from_journal else self._trie_cache

        try:
            return lookup_trie[address]
        except trie_exceptions.MissingTrieNode as exc:
            raise MissingAccountTrieNode(*exc.args) from exc
        except KeyError:
            # In case the account is deleted in the JournalDB
            return b''

    def _get_account(self, address: Address, from_journal: bool = True) -> Account:
        if from_journal and address in self._account_cache.keys():
            return self._account_cache[address]

        rlp_account = self._get_encoded_account(address, from_journal)

        if rlp_account:
            account = rlp.decode(rlp_account, sedes=Account)
        else:
            account = Account()
        if from_journal:
            self._account_cache[address] = account
        return account

    def _set_account(self, address: Address, account: Account) -> None:
        self._account_cache[address] = account
        rlp_account = rlp.encode(account, sedes=Account)
        self._journaltrie[address] = rlp_account

    #
    # Record and discard API
    #
    def record(self) -> JournalDBCheckpoint:
        checkpoint = self._journaldb.record()
        self._journaltrie.record(checkpoint)

        for _, store in self._dirty_account_stores():
            store.record(checkpoint)
        return checkpoint

    def discard(self, checkpoint: JournalDBCheckpoint) -> None:
        self._journaldb.discard(checkpoint)
        self._journaltrie.discard(checkpoint)
        self._account_cache.clear()

        for _, store in self._dirty_account_stores():
            store.discard(checkpoint)

    def commit(self, checkpoint: JournalDBCheckpoint) -> None:
        self._journaldb.commit(checkpoint)
        self._journaltrie.commit(checkpoint)

        for _, store in self._dirty_account_stores():
            store.commit(checkpoint)

    def lock_changes(self) -> None:
        for _, store in self._dirty_account_stores():
            store.lock_changes()

    def make_state_root(self) -> Hash32:
        for _, store in self._dirty_account_stores():
            store.make_storage_root()

        for address, storage_root in self._get_changed_roots():
            if self.account_exists(address) or storage_root != BLANK_ROOT_HASH:
                self._set_storage_root(address, storage_root)

        self._journaldb.persist()

        diff = self._journaltrie.diff()
        if diff.deleted_keys() or diff.pending_items():
            # In addition to squashing (which is redundant here), this context
            # manager causes an atomic commit of the changes, so exceptions
            # will revert the trie
            with self._trie.squash_changes() as memory_trie:
                self._apply_account_diff_without_proof(diff, memory_trie)

        self._journaltrie.reset()
        self._trie_cache.reset_cache()

        return self.state_root

    def persist(self) -> MetaWitnessAPI:
        self.make_state_root()

        # persist storage
        with self._raw_store_db.atomic_batch() as write_batch:
            for address, store in self._dirty_account_stores():
                self._validate_flushed_storage(address, store)
                store.persist(write_batch)

        for address, new_root in self._get_changed_roots():
            if new_root is None:
                raise ValidationError(
                    f"Cannot validate new root of account 0x{address.hex()} "
                    f"which has a new root hash of None")
            elif new_root not in self._raw_store_db and new_root != BLANK_ROOT_HASH:
                raise ValidationError(
                    "After persisting storage trie, a root node was not found. "
                    f"State root for account 0x{address.hex()} "
                    f"is missing for hash 0x{new_root.hex()}.")

        # generate witness (copy) before clearing the underlying data
        meta_witness = self._get_meta_witness()

        # reset local storage trackers
        self._account_stores = {}
        self._dirty_accounts = set()
        self._accessed_accounts = set()
        self._accessed_bytecodes = set()
        # We have to clear the account cache here so that future account accesses
        # will get added to _accessed_accounts correctly. Account accesses that
        # are cached do not add the address to the list of accessed accounts.
        self._account_cache.clear()

        # persist accounts
        self._validate_generated_root()
        new_root_hash = self.state_root
        with self._raw_store_db.atomic_batch() as write_batch:
            self._batchtrie.commit_to(write_batch, apply_deletes=False)
            self._batchdb.commit_to(write_batch, apply_deletes=False)
        self._root_hash_at_last_persist = new_root_hash

        return meta_witness

    def _get_accessed_node_hashes(self) -> Set[Hash32]:
        return cast(Set[Hash32], self._raw_store_db.keys_read)

    @to_dict
    def _get_access_list(self) -> Iterable[Tuple[Address, AccountQueryTracker]]:
        """
        Get the list of addresses that were accessed, whether the bytecode
        was accessed, and which storage slots were accessed.
        """
        for address in self._accessed_accounts:
            did_access_bytecode = address in self._accessed_bytecodes
            if address in self._account_stores:
                accessed_storage_slots = self._account_stores[address].get_accessed_slots()
            else:
                accessed_storage_slots = frozenset()
            yield address, AccountQueryTracker(did_access_bytecode, accessed_storage_slots)

    def _get_meta_witness(self) -> MetaWitness:
        """
        Get a variety of metadata about the state witness needed to execute the block.

        This creates a copy, so that underlying changes do not affect the
        returned MetaWitness.
        """
        return MetaWitness(self._get_accessed_node_hashes(), self._get_access_list())

    def _validate_generated_root(self) -> None:
        db_diff = self._journaldb.diff()
        if len(db_diff):
            raise ValidationError(
                f"AccountDB had a dirty db when it needed to be clean: {db_diff!r}")
        trie_diff = self._journaltrie.diff()
        if len(trie_diff):
            raise ValidationError(
                f"AccountDB had a dirty trie when it needed to be clean: {trie_diff!r}")

    def _apply_account_diff_without_proof(self, diff: DBDiff, trie: DatabaseAPI) -> None:
        """
        Apply diff of trie updates, when original nodes might be missing.
        Note that doing this naively will raise exceptions about missing nodes
        from *intermediate* trie roots. This captures exceptions and uses the previous
        trie root hash that will be recognized by other nodes.
        """
        # It's fairly common that when an account is deleted, we need to retrieve nodes
        # for accounts that were not needed during normal execution. We only need these
        # nodes to refactor the trie.
        for delete_key in diff.deleted_keys():
            try:
                del trie[delete_key]
            except trie_exceptions.MissingTrieNode as exc:
                raise MissingAccountTrieNode(
                    exc.missing_node_hash,
                    self._root_hash_at_last_persist,
                    exc.requested_key,
                ) from exc

        # It's fairly unusual, but possible, that setting an account will need unknown
        # nodes during a trie refactor. Here is an example that seems to cause it:
        #
        # Setup:
        #   - Root node is a branch, with 0 pointing to a leaf
        #   - The complete leaf key is (0, 1, 2), so (1, 2) is in the leaf node
        #   - We know the leaf node hash but not the leaf node body
        # Refactor that triggers missing node:
        #   - Add value with key (0, 3, 4)
        #   - We need to replace the current leaf node with a branch that points leaves at 1 and 3
        #   - The leaf for key (0, 1, 2) now contains only the (2) part, so needs to be rebuilt
        #   - We need the full body of the old (1, 2) leaf node, to rebuild

        for key, val in diff.pending_items():
            try:
                trie[key] = val
            except trie_exceptions.MissingTrieNode as exc:
                raise MissingAccountTrieNode(
                    exc.missing_node_hash,
                    self._root_hash_at_last_persist,
                    exc.requested_key,
                ) from exc
class DND:
    def __init__(self, kernel, num_neighbors, max_memory, embedding_size):
        # self.dictionary = LRUCache(max_memory)
        # self.kd_tree = kdtree.create(dimensions=embedding_size)

        # rnd_projection = RandomBinaryProjections("RBP", 8)
        # distance = EuclideanDistance()
        # nearest = NearestFilter(num_neighbors)
        # self.nearpy = Engine(dim=embedding_size, lshashes=[rnd_projection],
        #                      distance=distance, vector_filters=[nearest],
        #                      fetch_vector_filters=[])

        self.kd_tree = None
        # self.data = []
        # self.lshash = LSHash(hash_size=embedding_size, input_dim=embedding_size, num_hashtables=10)
        self.lru = LRU(size=max_memory)

        self.num_neighbors = num_neighbors
        self.kernel = kernel
        self.max_memory = max_memory
        self.embedding_size = embedding_size
        # self.keys_added = []

    def is_present(self, key):
        return tuple(key) in self.lru  # self.lru.has_key(tuple(key))
        # return self.dictionary.get(tuple(key)) is not None
        # return self.dictionary.get(tuple(key.data.cpu().numpy()[0])) is not None

    def get_value(self, key):
        return self.lru[tuple(key)]
        # return self.dictionary.get(tuple(key))
        # return self.dictionary.get(tuple(key.data.cpu().numpy()[0]))

    def lookup(self, lookup_key):
        # TODO: Speed up search knn
        # keys = [key[0].data for key in self.kd_tree.search_knn(lookup_key, self.num_neighbors)]
        lookup_key_numpy = lookup_key.data[0].numpy()
        # lookup_key_tuple = tuple(lookup_key_numpy)
        # keys = [key[0] for key in self.lshash.query_no_data(lookup_key_numpy, num_results=self.num_neighbors)]
        # keys = [key[1] for key in self.nearpy.neighbours(lookup_key_numpy)]

        if self.kd_tree is not None:
            # things_distances, things_index = self.kd_tree.query(lookup_key_numpy, k=self.num_neighbors, eps=1.0)
            things_index = self.kd_tree.query(
                [lookup_key_numpy],
                k=min(self.num_neighbors, len(self.kd_tree.data)),
                return_distance=False,
                sort_results=False)
            # note: iterating the (1, k) result row-wise means only the first
            # neighbor index of the single query row is used
            keys = [self.lru.keys()[ii[0]] for ii in things_index]
        else:
            keys = []

        # output, kernel_sum = Variable(FloatTensor([0])), Variable(FloatTensor([0]))
        output, kernel_sum = 0, 0
        # TODO: Speed this up since the kernel takes a significant amount of time
        for key in keys:
            # if not np.allclose(key, lookup_key_numpy):  # (key == lookup_key).data.all():
            if not np.all(key == lookup_key_numpy):
                # gg = Variable(FloatTensor(np.array(key)))
                gg = Variable(torch.from_numpy(np.array(key)))
                kernel_val = self.kernel(gg, lookup_key[0])
                output += kernel_val * self.lru.get(tuple(key))
                kernel_sum += kernel_val  # self.kernel(key, lookup_key)

        if isinstance(kernel_sum, int):
            # no neighbor contributed anything
            return (lookup_key * 0)[0][0]
        if kernel_sum.data[0] == 0 or len(keys) == 0:
            # zero lookup
            return (lookup_key * 0)[0][0]

        output = output / kernel_sum
        return output

    def upsert(self, key, value):
        # key = key.data[0].numpy()
        # self.keys_added.append(key)
        # if not self.lru.has_key(tuple(key)):  # self.is_present(key):
        #     self.kd_tree.add(key)
        #     self.lshash.index(input_point=key)
        #     self.nearpy.store_vector(key, data=key)
        self.lru[tuple(key)] = value
        # self.kd_tree = KDTree(data=self.lru.keys(), compact_nodes=False, copy_data=False, balanced_tree=False)
        self.kd_tree = KDTree(self.lru.keys())
        return

        # unreachable below: kept from an earlier capacity-eviction experiment
        if len(self.lru) == self.max_memory:
            # Expel least recently used key from self.dictionary and
            # self.kd_tree if memory used is at capacity
            # deleted_key = self.dictionary.delete_least_recently_used()[0]
            # deleted_key = np.array(self.lru.peek_last_item()[0])
            # thing = Variable(torch.from_numpy(deleted_key).float()).unsqueeze(0)
            key_to_delete = self.lru.peek_last_item()
            self.lru[tuple(key)] = value
            # self.kd_tree.remove(deleted_key)
            # Remake the LSHASH with the deleted key
            # self.lshash = LSHash(hash_size=self.embedding_size, input_dim=self.embedding_size)
            # for k in self.lru.keys():
            #     self.lshash.index(np.array(k))
            # self.nearpy.delete_vector(key_to_delete[0])
            # self.nearpy.clean_all_buckets()
            # for k in self.lru.keys():
            #     self.nearpy.store_vector(np.array(k))
            # Checking that the lru keys are the same as the keys in the lshash
            # for key in self.lru.keys():
            #     keys_close = [key[0] for key in self.lshash.query(key, num_results=5)]
            #     for kk in keys_close:
            #         if kk not in self.lru:
            #             print("\n\nProblems! Key in LSHASH not in LRU\n\n")
            # Check length of all lru keys
            # all_lru_keys = self.lshash.query(key)
            # print("\n", len(all_lru_keys), "\n")
        else:
            self.lru[tuple(key)] = value
            self.kdtree = KDTree(self.data)
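# The storage pattern above in miniature: an LRU keyed by embedding tuples,
# with a KDTree rebuilt from lru.keys() for k-NN lookup. Sizes and data are
# illustrative:
import numpy as np
from lru import LRU
from sklearn.neighbors import KDTree

memory = LRU(4)
for vec in np.random.rand(6, 3):            # 6 upserts into a 4-slot memory
    memory[tuple(vec)] = float(vec.sum())
    tree = KDTree(np.array(memory.keys()))  # index over the surviving keys

query = np.random.rand(1, 3)
idx = tree.query(query, k=2, return_distance=False)
neighbors = [memory.keys()[i] for i in idx[0]]
values = [memory[k] for k in neighbors]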
class Manager(object):

    def __init__(self):
        '''
        '''
        self._views = LRU(50)

        # tile cache - enough for 1 MFOV for 10 parallel users
        self._tiles = LRU(61 * 10)

        self._client_tiles = {}

    def start(self):
        '''
        '''
        pass

    def check_path_type(self, data_path):
        '''
        Check whether the data_path is a scan, section or fov.
        '''
        # we should check how many levels deep is the IMAGE_COORDINATES_FILE
        # level 0: this is a FOV
        # level 1: this is a section
        # level 2: this is a scan
        if os.path.exists(os.path.join(data_path, settings.IMAGE_COORDINATES_FILE)):
            return 'FOV'

        if os.path.exists(os.path.join(data_path,
                                       Util.get_first_level_subdir(data_path),
                                       settings.IMAGE_COORDINATES_FILE)):
            return 'SECTION'

        if os.path.exists(os.path.join(data_path,
                                       Util.get_second_level_subdir(data_path),
                                       settings.IMAGE_COORDINATES_FILE)):
            return 'SCAN'

        return None

    def get_tree(self, data_path):
        '''
        '''
        if not data_path:
            data_path = settings.DEFAULT_DATA_FOLDER

        dir_content = sorted(Util.listdir(data_path))

        dir_listing = []

        for c in dir_content:
            full_url = os.path.join(data_path, c)

            # if not os.path.isdir(full_url):
            #     continue

            entry = {}
            entry['label'] = c
            entry['full_url'] = full_url
            entry['id'] = os.path.join(data_path, c)
            entry['load_on_demand'] = True

            dir_listing.append(entry)

        return dir_listing

    def get_content(self, data_path):
        '''
        Sends the content listing for a given path. This detects if the path
        is scan, section or fov.
        '''
        views = []

        path_type = self.check_path_type(data_path)

        # detect if this is a scan, section or fov
        if path_type == 'FOV':
            views.append({'data_path': data_path})

        elif path_type == 'SECTION':
            views.append({'data_path': data_path})

        elif path_type == 'SCAN':
            scan = Scan.from_directory(data_path, False)  # lazy indexing

            for i, section in enumerate(scan._sections):
                views.append({'data_path': os.path.join(data_path, section.id)})

        return views

    def get_meta_info(self, data_path):
        '''
        Get meta information for a requested data path.
        '''
        if data_path not in self._views.keys():

            path_type = self.check_path_type(data_path)

            # detect if this is a section or fov
            if path_type == 'FOV':
                # this is a FoV
                fov = FoV.from_directory(data_path, True)
                view = View.create(data_path, [fov], fov._width, fov._height,
                                   fov._tx, fov._ty, self)

            elif path_type == 'SECTION':
                section = Section.from_directory(data_path, True, True)
                view = View.create(data_path, section._fovs, section._width,
                                   section._height, section._tx, section._ty,
                                   self, section._luts64_map)

            #
            # and add to our views dictionary
            #
            self._views[data_path] = view

        else:
            view = self._views[data_path]

        meta_info = {}
        meta_info['width'] = view._width
        meta_info['height'] = view._height
        meta_info['layer'] = 0
        meta_info['minLevel'] = 0
        meta_info['maxLevel'] = 1
        meta_info['tileSize'] = settings.CLIENT_TILE_SIZE
        meta_info['centers'] = view._centers

        return meta_info

    def get_image(self, data_path, x, y, z, w):
        '''
        Calculate which file(s) we need for the current openseadragon tile
        and load them as well as downsample them on the fly.
        '''
        # print '-'*80
        # print 'SD', data_path, x, y, z, w

        if settings.CACHE_CLIENT_TILES:

            osd_file_url = (data_path.replace('/', '_') + '_' + str(x) + '_' +
                            str(y) + '_' + str(z) + '_' + str(w) + '.jpg')
            osd_file_url_full = os.path.join(settings.CLIENT_TILE_CACHE_FOLDER,
                                             osd_file_url)

            if os.path.exists(osd_file_url_full):
                # we have this OSD tile cached on disk
                # print 'OSD CACHE HIT'
                osd_tile = cv2.imread(osd_file_url_full, 0)
                return cv2.imencode('.jpg', osd_tile)[1].tostring()

        view = self._views[data_path]

        # Create an empty dictionary for the View's luts64_map, if there isn't a map
        luts64_map = dict()
        if view._luts64_map is not None:
            luts64_map = view._luts64_map

        # calculate canvas coordinates
        x_c = x * settings.CLIENT_TILE_SIZE
        y_c = y * settings.CLIENT_TILE_SIZE
        w_c = settings.CLIENT_TILE_SIZE
        h_c = settings.CLIENT_TILE_SIZE

        top_left = [x_c, y_c]
        bottom_right = [x_c + w_c, y_c + h_c]

        # loop through all tiles and find ones which match the x_c, y_c, w_c,
        # h_c bounding box
        required_tiles = {}
        for t in view._tiles:
            tile_dict = view._tiles[t]

            tile = tile_dict['tile']

            # now the normalized coordinates which should match the coordinate
            # system
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            width = tile_dict['width'] / 2**w
            height = tile_dict['height'] / 2**w

            t_top_left = [tx, ty]
            t_bottom_right = [tx + width, ty + height]

            comp0 = top_left[0] < t_bottom_right[0]
            comp1 = bottom_right[0] > t_top_left[0]
            comp2 = top_left[1] < t_bottom_right[1]
            comp3 = bottom_right[1] > t_top_left[1]

            overlapping = comp0 and comp1 and comp2 and comp3

            if overlapping:
                required_tiles[t] = tile_dict

        stitched_w = min(view._width / 2**w - x_c, settings.CLIENT_TILE_SIZE)
        stitched_h = min(view._height / 2**w - y_c, settings.CLIENT_TILE_SIZE)

        stitched = np.zeros((stitched_h, stitched_w), dtype=np.uint8)
        if settings.INVERT:
            stitched[:] = 255

        # sort the required tiles to always give priority in the same order
        required_tiles_keys = sorted(required_tiles,
                                     key=lambda key: required_tiles[key])

        for t in required_tiles_keys:

            tile_dict = required_tiles[t]
            tile = tile_dict['tile']

            # fov paths need to be treated differently
            if self.check_path_type(data_path) != 'FOV':
                t_abs_data_path = os.path.join(data_path, tile_dict['fov'])
            else:
                t_abs_data_path = data_path

            # print 'LOADING', os.path.join(t_abs_data_path, tile._filename)
            if t in self._tiles.keys() and w in self._tiles[t]:
                current_tile = self._tiles[t][w]
                # print 'CACHE HIT'
            else:
                #
                # we add to cache
                #
                # print "Loading lut64_map of: {} --> {}".format(
                #     tile.id, luts64_map.get(os.path.split(tile.id)[-1].lower(), None))
                tile_img = tile.load(t_abs_data_path, settings.IMAGE_PREFIX,
                                     lut_base64=luts64_map.get(
                                         os.path.split(tile.id)[-1].lower(), None))
                current_tile = Manager.downsample_image(tile_img, 2**w)
                self._tiles[t] = {w: current_tile}

            # stitch it in our little openseadragon tile
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            t_width = tile_dict['width'] / 2**w
            t_height = tile_dict['height'] / 2**w

            stitched_x = int(max(tx, top_left[0]) - top_left[0])
            stitched_y = int(max(ty, top_left[1]) - top_left[1])

            stitched_w = int(min(t_width - max(top_left[0] - tx, 0),
                                 settings.CLIENT_TILE_SIZE - stitched_x))
            stitched_h = int(min(t_height - max(top_left[1] - ty, 0),
                                 settings.CLIENT_TILE_SIZE - stitched_y))

            t_sub_x = int(max(tx, top_left[0]) - tx)
            t_sub_y = int(max(ty, top_left[1]) - ty)

            stitched[stitched_y:stitched_y + stitched_h,
                     stitched_x:stitched_x + stitched_w] = current_tile[
                t_sub_y:t_sub_y + stitched_h, t_sub_x:t_sub_x + stitched_w]

        if settings.INVERT:
            stitched = 255 - stitched

        if settings.CACHE_CLIENT_TILES:
            # print 'Writing OSD tile', osd_file_url_full
            cv2.imwrite(osd_file_url_full, stitched)

        return cv2.imencode('.jpg', stitched)[1].tostring()

    # Helping function
    @staticmethod
    def downsample_image(imagedata, factor):
        '''
        '''
        if factor == 1.:
            return imagedata

        factor = 1. / factor
        return cv2.resize(imagedata, (0, 0), fx=factor, fy=factor,
                          interpolation=cv2.INTER_LINEAR)
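# The Manager's tile cache in isolation: an LRU of downsampled tiles keyed
# by tile id. Note the original stores {w: tile}, so caching a new zoom
# level replaces any previously cached levels for that id; keeping a dict
# per id (as below) would retain them. `load` is an assumed loader callback:
from lru import LRU

tiles = LRU(61 * 10)    # enough for 1 MFOV for 10 parallel users

def get_tile(tile_id, zoom, load):
    levels = tiles.get(tile_id)
    if levels is None or zoom not in levels:
        img = load(tile_id, zoom)   # expensive read + downsample
        levels = levels or {}
        levels[zoom] = img
        tiles[tile_id] = levels     # insert/refresh in the LRU
    return levels[zoom]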
class FCP(BaseTask): def __init__(self, circle, src, dest, treewalk=None, totalsize=0, hostcnt=0, prune=False, verify=False, resume=False, workq=None): BaseTask.__init__(self, circle) self.circle = circle self.treewalk = treewalk self.totalsize = totalsize self.prune = prune self.workq = workq self.resume = resume self.checkpoint_file = None self.src = src self.dest = os.path.abspath(dest) # cache, keep the size conservative # TODO: we need a more portable LRU size if hostcnt != 0: max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE) procs_per_host = self.circle.size / hostcnt self._read_cache_limit = ((max_ofile - 64) / procs_per_host) / 3 self._write_cache_limit = ((max_ofile - 64) / procs_per_host) * 2 / 3 if self._read_cache_limit <= 0 or self._write_cache_limit <= 0: self._read_cache_limit = 1 self._write_cache_limit = 8 self.rfd_cache = LRU(self._read_cache_limit) self.wfd_cache = LRU(self._write_cache_limit) self.cnt_filesize_prior = 0 self.cnt_filesize = 0 self.blocksize = 1024 * 1024 self.chunksize = 1024 * 1024 # debug self.d = {"rank": "rank %s" % circle.rank} self.wtime_started = MPI.Wtime() self.wtime_ended = None self.workcnt = 0 # this is the cnt for the enqued items self.reduce_items = 0 # this is the cnt for processed items if self.treewalk: log.debug("treewalk files = %s" % treewalk.flist, extra=self.d) # fini_check self.fini_cnt = Counter() # verify self.verify = verify self.chunksums = [] # checkpointing self.checkpoint_interval = sys.maxsize self.checkpoint_last = MPI.Wtime() if self.circle.rank == 0: print("Start copying process ...") def rw_cache_limit(self): return (self._read_cache_limit, self._write_cache_limit) def set_fixed_chunksize(self, sz): self.chunksize = sz def set_adaptive_chunksize(self, totalsz): self.chunksize = utils.calc_chunksize(totalsz) if self.circle.rank == 0: print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize)) def cleanup(self): for f in self.rfd_cache.values(): try: os.close(f) except OSError as e: pass for f in self.wfd_cache.values(): try: os.close(f) except OSError as e: pass # remove checkpoint file if self.checkpoint_file and os.path.exists(self.checkpoint_file): os.remove(self.checkpoint_file) # we need to do this because if last job didn't finish cleanly # the fwalk files can be found as leftovers # and if fcp cleanup has a chance, it should clean up that fwalk = "%s/fwalk.%s" % (self.circle.tempdir, self.circle.rank) if os.path.exists(fwalk): os.remove(fwalk) def new_fchunk(self, fitem): fchunk = FileChunk() # default cmd = copy fchunk.src = fitem.path fchunk.dest = destpath(fitem, self.dest) return fchunk def enq_file(self, fi): """ Process a single file, represented by "fi" - FileItem It involves chunking this file and equeue all chunks. 
""" chunks = fi.st_size / self.chunksize remaining = fi.st_size % self.chunksize workcnt = 0 if fi.st_size == 0: # empty file fchunk = self.new_fchunk(fi) fchunk.offset = 0 fchunk.length = 0 self.enq(fchunk) workcnt += 1 else: for i in range(chunks): fchunk = self.new_fchunk(fi) fchunk.offset = i * self.chunksize fchunk.length = self.chunksize self.enq(fchunk) workcnt += chunks if remaining > 0: # send remainder fchunk = self.new_fchunk(fi) fchunk.offset = chunks * self.chunksize fchunk.length = remaining self.enq(fchunk) workcnt += 1 # save work cnt self.workcnt += workcnt log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt), extra=self.d) def handle_fitem(self, fi): if os.path.islink(fi.path): dest = destpath(fi, self.dest) linkto = os.readlink(fi.path) try: os.symlink(linkto, dest) except Exception as e: log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d) elif stat.S_ISREG(fi.st_mode): self.enq_file(fi) # where chunking takes place def create(self): """ Each task has one create(), which is invoked by circle ONCE. For FCP, each task will handle_fitem() -> enq_file() to process each file gathered during the treewalk stage. """ if not G.use_store and self.workq: # restart self.setq(self.workq) return if self.resume: return # construct and enable all copy operations # we batch operation hard-coded log.info("create() starts, flist length = %s" % len(self.treewalk.flist), extra=self.d) if G.use_store: while self.treewalk.flist.qsize > 0: fitems, _ = self.treewalk.flist.mget(G.DB_BUFSIZE) for fi in fitems: self.handle_fitem(fi) self.treewalk.flist.mdel(G.DB_BUFSIZE) # store checkpoint log.debug("dbname = %s" % self.circle.dbname) dirname = os.path.dirname(self.circle.dbname) basename = os.path.basename(self.circle.dbname) chkpointname = basename + ".CHECK_OK" self.checkpoint_file = os.path.join(dirname, chkpointname) with open(self.checkpoint_file, "w") as f: f.write("%s" % self.totalsize) else: # use memory for fi in self.treewalk.flist: self.handle_fitem(fi) # memory-checkpoint if self.checkpoint_file: self.do_no_interrupt_checkpoint() self.checkpoint_last = MPI.Wtime() def do_open(self, k, d, flag, limit): """ @param k: the file path @param d: dictionary of <path, file descriptor> @return: file descriptor """ if d.has_key(k): return d[k] if len(d.keys()) >= limit: # over the limit # clean up the least used old_k, old_v = d.items()[-1] try: os.close(old_v) except OSError as e: log.warn("FD for %s not valid when closing" % old_k, extra=self.d) fd = -1 try: fd = os.open(k, flag) except OSError as e: if e.errno == 28: # no space left log.error("Critical error: %s, exit!" 
% e, extra=self.d) self.circle.exit(0) # should abort else: log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d) else: if fd > 0: d[k] = fd finally: return fd @staticmethod def do_mkdir(work): src = work.src dest = work.dest if not os.path.exists(dest): os.makedirs(dest) def do_copy(self, work): src = work.src dest = work.dest basedir = os.path.dirname(dest) if not os.path.exists(basedir): os.makedirs(basedir) rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit) if rfd < 0: return False wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit) if wfd < 0: if args.force: try: os.unlink(dest) except OSError as e: log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d) return False else: wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY, self._write_cache_limit) else: log.error("Failed to create output file %s" % dest, extra=self.d) return False # do the actual copy self.write_bytes(rfd, wfd, work) # update tally self.cnt_filesize += work.length if G.verbosity > 2: log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" % (self.cnt_filesize, src, dest), extra=self.d) return True def do_no_interrupt_checkpoint(self): a = Thread(target=self.do_checkpoint) a.start() a.join() log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d) def do_checkpoint(self): for k in self.wfd_cache.keys(): os.close(self.wfd_cache[k]) # clear the cache self.wfd_cache.clear() tmp_file = self.checkpoint_file + ".part" with open(tmp_file, "wb") as f: cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize) pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL) # POSIX requires rename to be atomic os.rename(tmp_file, self.checkpoint_file) def process(self): """ The only work is "copy" TODO: clean up other actions such as mkdir/fini_check """ if not G.use_store: curtime = MPI.Wtime() if curtime - self.checkpoint_last > self.checkpoint_interval: self.do_no_interrupt_checkpoint() log.info("Checkpointing done ...", extra=self.d) self.checkpoint_last = curtime work = self.deq() self.reduce_items += 1 if isinstance(work, FileChunk): self.do_copy(work) else: log.warn("Unknown work object: %s" % work, extra=self.d) def reduce_init(self, buf): buf['cnt_filesize'] = self.cnt_filesize def reduce(self, buf1, buf2): buf1['cnt_filesize'] += buf2['cnt_filesize'] return buf1 def reduce_report(self, buf): out = "" if self.totalsize != 0: out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize) out += "%s copied" % bytes_fmt(buf['cnt_filesize']) if self.circle.reduce_time_interval != 0: rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval self.cnt_filesize_prior = buf['cnt_filesize'] out += ", estimated transfer rate: %s/s" % bytes_fmt(rate) print(out) def reduce_finish(self, buf): # self.reduce_report(buf) pass def epilogue(self): global taskloads self.wtime_ended = MPI.Wtime() taskloads = self.circle.comm.gather(self.reduce_items) if self.circle.rank == 0: if self.totalsize == 0: print("\nZero filesize detected, done.\n") return tlapse = self.wtime_ended - self.wtime_started rate = float(self.totalsize) / tlapse print("\nFCP Epilogue:\n") print("\t{:<20}{:<20}".format("Ending at:", utils.current_time())) print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse))) print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate))) print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads)) def read_then_write(self, rfd, wfd, work, 
        """
        Core entry point for the copy action: first read, then write.

        @param num_of_bytes: the exact number of bytes we will copy
        @return: False if unsuccessful.
        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            log.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            log.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.src, offset=work.offset, length=work.length,
                          digest=m.hexdigest())
            self.chunksums.append(ck)
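The checkpointing above leans on the write-to-temp-then-rename idiom so a crash can never leave a half-written checkpoint. Here is a minimal, self-contained sketch of that pattern; the `atomic_checkpoint` helper and the sample state dict are made up for illustration, everything else is standard library:

import os
import pickle
import tempfile

def atomic_checkpoint(state, checkpoint_path):
    """Persist `state` so readers never observe a partially written file.

    The pickle goes into a temp file in the same directory; POSIX guarantees
    the final rename is atomic on a single filesystem, so a crash before the
    rename leaves only a stale temp file behind.
    """
    dirname = os.path.dirname(os.path.abspath(checkpoint_path))
    fd, tmp_path = tempfile.mkstemp(dir=dirname, suffix=".part")
    try:
        with os.fdopen(fd, "wb") as f:
            pickle.dump(state, f, pickle.HIGHEST_PROTOCOL)
        os.rename(tmp_path, checkpoint_path)  # atomic replace
    except BaseException:
        os.unlink(tmp_path)  # clean up the partial temp file
        raise

if __name__ == "__main__":
    atomic_checkpoint({"workq": [], "totalsize": 0}, "fcp.checkpoint")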
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        # self.time = (self.first, self.last)
        self.l1 = LRU(c_hash)
        self.first = ""
        self.last = ""
        self.lats = []
        self.longs = []
        self.l2 = LRU(c_user)
        self.l3 = LRU(c_words)
        self.l4 = LRU(400)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set_wordLRU(self, l):
        self.set(self.l3, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words, links, cords):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1
        for k in words:
            self.l3[k] = self.l3.get(k, 0) + 1
        for k in links:
            self.l4[k] = self.l4.get(k, 0) + 1
        if cords is not None:
            self.lats.append(cords["coordinates"][1])
            self.longs.append(cords["coordinates"][0])
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        w1 = self.l3.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if self.l1.has_key(h):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if self.l2.has_key(u):
                u_ind += u1 - self.l2.keys().index(u)
        for w in words:
            w_sum += self.l3.get(w, 0)
            if self.l3.has_key(w):
                w_ind += w1 - self.l3.keys().index(w)
        if h_match != 0:
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = (h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1])) \
            + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1])) \
            + (w_ind / (w1 + 1)) * (w_sum / sum(self.l3.values() + [1])) + c
        return similarity

    def flush1(self, cache, size):
        if len(cache.keys()) > 5:
            # keep the five most recently used tokens, reinserting them
            # oldest-first so their MRU order is preserved
            # (the `size` argument is unused here)
            tokens = reversed(cache.keys()[:5])
            cache.clear()
            for i in tokens:
                cache[i] = 1

    def flush(self):
        self.flush1(self.l1, 500)
        self.flush1(self.l2, 500)
        self.flush1(self.l3, 3500)
        self.topic_count = 1
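For orientation, here is a hedged sketch of how a topic4 cluster might be driven; the capacities and the sample tweet features are made up, and LRU is the lru-dict class used throughout this document:

from lru import LRU  # pip install lru-dict

topic = topic4(c_hash=500, c_user=500, c_words=3500)

# fold one tweet's features into the cluster (no coordinates in this example)
topic.set_cluster(
    hashtags=["python", "lru"],
    users=["@alice"],
    words=["cache", "eviction"],
    links=["https://example.com"],
    cords=None,
)

# score an incoming tweet against the cluster; higher means more similar,
# with extra weight for features near the MRU end of each LRU
score = topic.get_similarity(["python"], ["@alice"], ["cache"])
print(score)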
class GelbooruViewer:
    API_URL = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
    MAX_ID = 1
    MAX_ID_LOCK = Lock()
    MAX_CACHE_SIZE = 32
    MAX_CACHE_TIME = 24 * 60  # minutes
    PICTURES_PER_TAG = 200

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(
            {
                'Accept': 'application/json, application/xml',
                'Accept-Language': 'en-US',
                'User-Agent': 'Mozilla/5.0 GelbooruViewer/1.0 (+https://github.com/ArchieMeng/GelbooruViewer)'
            }
        )
        # only cache for get_all with tags while pid is 0!!!
        if importlib.find_loader('lru'):
            from lru import LRU
            self.cache = LRU(GelbooruViewer.MAX_CACHE_SIZE)
        else:
            self.cache = dict()
        self.cache_lock = Lock()
        # occasionally update cache
        self.last_cache_used = time()
        self.update_cache_thread = Thread(target=self._update_cache_loop, daemon=True)
        self.update_cache_thread.start()
        # get latest image to update MAX_ID
        self.get(limit=0)

    def _update_cache(self, tags, num=None):
        """
        Do the cache-update task.

        :param tags: tags of the pictures to update in the cache
        :param num: number of pictures
        :return:
        """
        if tags:
            result = [*self.get_all_generator(tags, 0, num, thread_limit=1, limit=100)]
            if result:
                key = '+'.join(tags)
                with self.cache_lock:
                    self.cache[key] = result

    def _update_cache_loop(self):
        """
        Occasionally refresh the cache. Clear the cache if it has been unused
        for a long time.

        :return:
        """
        minutes = 2 * 60
        while True:
            sleep(60 * minutes)
            if time() - self.last_cache_used > self.MAX_CACHE_TIME * 60:
                self.cache.clear()
                gc.collect()
                continue
            with self.cache_lock:
                keys = self.cache.keys()
            with ThreadPoolExecutor(max_workers=2) as executor:
                futures = [
                    executor.submit(self._update_cache, key.split('+'),
                                    GelbooruViewer.PICTURES_PER_TAG)
                    for key in keys
                ]
                for future in as_completed(futures):
                    try:
                        result = future.result()
                        print(result)
                    except Exception as e:
                        print("Exception happened in GelbooruViewer._update_cache_loop",
                              type(e), e)

    def get_raw_content(self, **kwargs):
        content = None
        with self.session as session:
            response = session.get(GelbooruViewer.API_URL, params=kwargs)
            try:
                content = response.content
            except Exception as e:
                logging.error(str(e))
        return content

    def get(self, **kwargs) -> list:
        """
        Use the Gelbooru api to fetch picture info.

        :param kwargs: allowed args include

            limit: How many posts you want to retrieve. There is a hard limit
                of 100 posts per request.

            pid: The page number.

            cid: Change ID of the post. This is in Unix time, so there are
                likely others with the same value if updated at the same time.

            tags: The tags to search for. Any tag combination that works on
                the web site will work here. This includes all the meta-tags.
                See the cheatsheet for more information.
        :return: a list of GelbooruPicture; if something goes wrong, an empty
            list is returned
        """
        attempt = 0
        content = None
        while attempt < 3 and content is None:
            attempt += 1
            content = self.get_raw_content(**kwargs)
        if content is None:
            return []
        if isinstance(content, bytes):
            xml_str = content.decode('utf-8')
        else:
            xml_str = content
        root = ElementTree.fromstring(xml_str)
        posts = root.findall('post')
        picture_list = []
        if posts:
            cur_max_id = int(posts[0].attrib['id'])
            with GelbooruViewer.MAX_ID_LOCK:
                GelbooruViewer.MAX_ID = max(GelbooruViewer.MAX_ID, cur_max_id)
        else:
            return None
        for post in posts:
            info = post.attrib
            picture_list.append(
                GelbooruPicture(
                    info['width'],
                    info['height'],
                    info['score'],
                    info['source'],
                    "https:" + info['preview_url'],
                    "https:" + info['sample_url'],
                    "https:" + info['file_url'],
                    info['created_at'],
                    info['creator_id'],
                    [tag for tag in info['tags'].split(' ')
                     if tag and not tag.isspace()],
                    info['id'],
                    info['rating']
                )
            )
        return picture_list

    def get_all(self, tags: list, pid=0, num=None, thread_limit=5, use_cache=True, limit=25):
        """
        Regardless of the official per-request limit, use threading to request
        as many pictures as you want.

        When the pictures are found in the cache, a list is returned.
        When the pictures are found but not in the cache, a generator is returned.
        Otherwise, None is returned.

        :param limit: number of pictures per request
        :param use_cache: whether to prefer the internal cache
        :param thread_limit: number of threads running at the same time
        :param tags: tags must be provided
        :param pid: beginning page id, indexed from 0
        :param num: number of pictures you want. This function may return
            fewer pictures than requested if Gelbooru doesn't have enough.
        :return: a generator of GelbooruPicture, a list, or None
        """
        tags.sort()
        if use_cache and pid == 0:
            with self.cache_lock:
                key = '+'.join(tags)
                if key in self.cache and isinstance(self.cache[key], list):
                    self.last_cache_used = time()
                    if not num:
                        return self.cache[key]
                    else:
                        return self.cache[key][:num]
                elif key not in self.cache or isinstance(self.cache[key], str):
                    self.last_cache_used = time()
                    # only one update thread runs at a time; while an update is
                    # executing, a str marker is put into the cache
                    self.cache[key] = "executing"
                    # cache size is currently limited, in case of a memory leak
                    thread = Thread(
                        target=self._update_cache,
                        args=(tags, GelbooruViewer.PICTURES_PER_TAG),
                        daemon=True
                    )
                    thread.start()
        content = self.get_raw_content(tags=tags, limit=0)
        xml_str = content.decode('utf-8')
        root = ElementTree.fromstring(xml_str)
        try:
            total = int(root.attrib['count'])
        except (KeyError, ValueError):
            return None
        if total > 0:
            return self.get_all_generator(tags, pid, num, thread_limit, total, limit)
        else:
            return None

    def get_all_generator(
            self,
            tags: list,
            pid=0,
            num=None,
            thread_limit=5,
            total=None,
            limit=25
    ):
        """
        The real implementation of get_all; a generator is returned.

        :param thread_limit: max threads fetching pictures at one time
        :param tags: tags of the pictures
        :param pid: beginning page id, indexed from 0
        :param num: number of pictures you want. This function may return
            fewer pictures than requested if Gelbooru doesn't have enough.
        :param total: total number of pictures; just pass None if you don't
            know it. This is used internally.
        :param limit: pictures per request. Roughly, limit=10 costs 1.2s per
            request, 25 costs 1.4s, 50 costs 2.2s, and 100 costs 2.6s. The
            larger the limit, the faster each request, but the slower
            get_all is in total.
        :return:
        """
        if limit < 0 or limit > 100:
            limit = 10

        def _get(tags, pid):
            content = self.get_raw_content(tags=tags, limit=limit, pid=pid)
            xml_string = content.decode()
            posts = ElementTree.fromstring(xml_string).findall('post')
            return posts

        if total is None:
            content = self.get_raw_content(tags=tags, limit=0)
            xml_str = content.decode('utf-8')
            root = ElementTree.fromstring(xml_str)
            total = int(root.attrib['count'])
        if isinstance(num, int):
            if num > 0:
                # if the total amount is too large, use num instead
                total = min(total, num)
        if tags and total > 0:
            with ThreadPoolExecutor(max_workers=thread_limit) as executor:
                final_pid = int(total / limit)
                start = pid
                tasks = []
                while start < final_pid + 1:
                    futures2idx = {
                        executor.submit(_get, tags, i): i
                        for i in tasks + [j for j in range(start, min(start + thread_limit, final_pid + 1))]
                    }
                    tasks = []
                    for future in as_completed(futures2idx):
                        idx = futures2idx[future]
                        try:
                            posts = future.result()
                            for post in posts:
                                info = post.attrib
                                yield GelbooruPicture(
                                    info['width'],
                                    info['height'],
                                    info['score'],
                                    info['source'],
                                    "https:" + info['preview_url'],
                                    "https:" + info['sample_url'],
                                    "https:" + info['file_url'],
                                    info['created_at'],
                                    info['creator_id'],
                                    [tag for tag in info['tags'].split(' ')
                                     if tag and not tag.isspace()],
                                    info['id'],
                                    info['rating']
                                )
                        except Exception as e:
                            # failed pages are queued in `tasks` and retried on
                            # the next round of submissions
                            print("GelbooruViewer.get_all_generator raised", type(e), e)
                            tasks.append(idx)
                    start += thread_limit
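As a usage sketch (network access and the surrounding imports such as requests and GelbooruPicture assumed available; the tag is made up), fetching a bounded number of pictures might look like this:

viewer = GelbooruViewer()

# get_all may return a cached list, a lazy generator, or None,
# so iterate either way after a None check
pictures = viewer.get_all(["touhou"], num=30)
if pictures is not None:
    for pic in pictures:
        print(pic)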
class Manager(object):

    def __init__(self):
        '''
        '''
        self._views = LRU(50)

        # tile cache - enough for 1 MFOV for 10 parallel users
        self._tiles = LRU(61 * 10)

        self._client_tiles = {}

    def start(self):
        '''
        '''
        pass

    def check_path_type(self, data_path):
        '''
        Check whether the data_path is a scan, section or fov.
        '''
        # we should check how many levels deep is the IMAGE_COORDINATES_FILE
        # level 0: this is a FOV
        # level 1: this is a section
        # level 2: this is a scan
        if os.path.exists(os.path.join(data_path, settings.IMAGE_COORDINATES_FILE)):
            return 'FOV'

        if os.path.exists(os.path.join(data_path,
                                       Util.get_first_level_subdir(data_path),
                                       settings.IMAGE_COORDINATES_FILE)):
            return 'SECTION'

        if os.path.exists(os.path.join(data_path,
                                       Util.get_second_level_subdir(data_path),
                                       settings.IMAGE_COORDINATES_FILE)):
            return 'SCAN'

        return None

    def get_tree(self, data_path):
        '''
        '''
        if not data_path:
            data_path = settings.DEFAULT_DATA_FOLDER

        dir_content = sorted(Util.listdir(data_path))

        dir_listing = []
        for c in dir_content:
            full_url = os.path.join(data_path, c)
            # if not os.path.isdir(full_url):
            #     continue
            entry = {}
            entry['label'] = c
            entry['full_url'] = full_url
            entry['id'] = os.path.join(data_path, c)
            entry['load_on_demand'] = True
            dir_listing.append(entry)

        return dir_listing

    def get_content(self, data_path):
        '''
        Sends the content listing for a given path. This detects whether the
        path is a scan, section or fov.
        '''
        views = []

        path_type = self.check_path_type(data_path)

        # detect if this is a scan, section or fov
        if path_type == 'FOV':
            views.append({'data_path': data_path})
        elif path_type == 'SECTION':
            views.append({'data_path': data_path})
        elif path_type == 'SCAN':
            scan = Scan.from_directory(data_path, False)  # lazy indexing
            for i, section in enumerate(scan._sections):
                views.append({'data_path': os.path.join(data_path, section.id)})

        return views

    def get_meta_info(self, data_path):
        '''
        Get meta information for a requested data path.
        '''
        if data_path not in self._views.keys():
            path_type = self.check_path_type(data_path)

            # detect if this is a section or fov
            if path_type == 'FOV':
                # this is a FoV
                fov = FoV.from_directory(data_path, True)
                view = View.create(data_path, [fov], fov._width, fov._height,
                                   fov._tx, fov._ty, self)
            elif path_type == 'SECTION':
                section = Section.from_directory(data_path, True, True)
                view = View.create(data_path, section._fovs, section._width,
                                   section._height, section._tx, section._ty,
                                   self, section._luts64_map)

            #
            # and add to our views dictionary
            #
            self._views[data_path] = view

        else:
            view = self._views[data_path]

        meta_info = {}
        meta_info['width'] = view._width
        meta_info['height'] = view._height
        meta_info['layer'] = 0
        meta_info['minLevel'] = 0
        meta_info['maxLevel'] = 1
        meta_info['tileSize'] = settings.CLIENT_TILE_SIZE
        meta_info['centers'] = view._centers

        return meta_info

    def get_image(self, data_path, x, y, z, w):
        '''
        Calculate which file(s) we need for the current openseadragon tile,
        then load them and downsample them on the fly.
        '''
        # print '-'*80
        # print 'SD', data_path, x, y, z, w

        if settings.CACHE_CLIENT_TILES:
            osd_file_url = (data_path.replace('/', '_') + '_' + str(x) + '_' +
                            str(y) + '_' + str(z) + '_' + str(w) + '.jpg')
            osd_file_url_full = os.path.join(settings.CLIENT_TILE_CACHE_FOLDER,
                                             osd_file_url)

            if os.path.exists(osd_file_url_full):
                # we have this OSD tile cached on disk
                # print 'OSD CACHE HIT'
                osd_tile = cv2.imread(osd_file_url_full, 0)
                return cv2.imencode('.jpg', osd_tile)[1].tostring()

        view = self._views[data_path]

        # create an empty dictionary for the View's luts64_map, if there isn't a map
        luts64_map = dict()
        if view._luts64_map is not None:
            luts64_map = view._luts64_map

        # calculate canvas coordinates
        x_c = x * settings.CLIENT_TILE_SIZE
        y_c = y * settings.CLIENT_TILE_SIZE
        w_c = settings.CLIENT_TILE_SIZE
        h_c = settings.CLIENT_TILE_SIZE

        top_left = [x_c, y_c]
        bottom_right = [x_c + w_c, y_c + h_c]

        # loop through all tiles and find ones which match the x_c, y_c, w_c,
        # h_c bounding box
        required_tiles = {}
        for t in view._tiles:
            tile_dict = view._tiles[t]
            tile = tile_dict['tile']

            # now the normalized coordinates which should match the coordinate
            # system
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            width = tile_dict['width'] / 2**w
            height = tile_dict['height'] / 2**w

            t_top_left = [tx, ty]
            t_bottom_right = [tx + width, ty + height]

            # axis-aligned bounding-box overlap test
            comp0 = top_left[0] < t_bottom_right[0]
            comp1 = bottom_right[0] > t_top_left[0]
            comp2 = top_left[1] < t_bottom_right[1]
            comp3 = bottom_right[1] > t_top_left[1]

            overlapping = comp0 and comp1 and comp2 and comp3

            if overlapping:
                required_tiles[t] = tile_dict

        # int() keeps the dimensions integral under Python 3's true division
        stitched_w = int(min(view._width / 2**w - x_c, settings.CLIENT_TILE_SIZE))
        stitched_h = int(min(view._height / 2**w - y_c, settings.CLIENT_TILE_SIZE))

        stitched = np.zeros((stitched_h, stitched_w), dtype=np.uint8)
        if settings.INVERT:
            stitched[:] = 255

        # sort the required tiles to always give priority in the same order
        required_tiles_keys = sorted(required_tiles,
                                     key=lambda key: required_tiles[key])

        for t in required_tiles_keys:
            tile_dict = required_tiles[t]
            tile = tile_dict['tile']

            # fov paths need to be treated differently
            if self.check_path_type(data_path) != 'FOV':
                t_abs_data_path = os.path.join(data_path, tile_dict['fov'])
            else:
                t_abs_data_path = data_path

            # print 'LOADING', os.path.join(t_abs_data_path, tile._filename)
            if t in self._tiles.keys() and w in self._tiles[t]:
                current_tile = self._tiles[t][w]
                # print 'CACHE HIT'
            else:
                #
                # we add to cache
                #
                # print "Loading lut64_map of: {} --> {}".format(tile.id, luts64_map.get(os.path.split(tile.id)[-1].lower(), None))
                tile_img = tile.load(t_abs_data_path, settings.IMAGE_PREFIX,
                                     lut_base64=luts64_map.get(
                                         os.path.split(tile.id)[-1].lower(), None))
                current_tile = Manager.downsample_image(tile_img, 2**w)
                self._tiles[t] = {w: current_tile}

            # stitch it in our little openseadragon tile
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            t_width = tile_dict['width'] / 2**w
            t_height = tile_dict['height'] / 2**w

            stitched_x = int(max(tx, top_left[0]) - top_left[0])
            stitched_y = int(max(ty, top_left[1]) - top_left[1])

            stitched_w = int(min(t_width - max(top_left[0] - tx, 0),
                                 settings.CLIENT_TILE_SIZE - stitched_x))
            stitched_h = int(min(t_height - max(top_left[1] - ty, 0),
                                 settings.CLIENT_TILE_SIZE - stitched_y))

            t_sub_x = int(max(tx, top_left[0]) - tx)
            t_sub_y = int(max(ty, top_left[1]) - ty)

            stitched[stitched_y:stitched_y + stitched_h,
                     stitched_x:stitched_x + stitched_w] = \
                current_tile[t_sub_y:t_sub_y + stitched_h,
                             t_sub_x:t_sub_x + stitched_w]

        if settings.INVERT:
            stitched = 255 - stitched
        if settings.CACHE_CLIENT_TILES:
            # print 'Writing OSD tile', osd_file_url_full
            cv2.imwrite(osd_file_url_full, stitched)

        return cv2.imencode('.jpg', stitched)[1].tostring()

    # helper function
    @staticmethod
    def downsample_image(imagedata, factor):
        '''
        '''
        if factor == 1.:
            return imagedata

        factor = 1. / factor
        return cv2.resize(imagedata, (0, 0), fx=factor, fy=factor,
                          interpolation=cv2.INTER_LINEAR)
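A quick, self-contained check of the downsampling helper above, using a synthetic gradient image so only numpy and OpenCV are needed:

import cv2
import numpy as np

# 512x512 horizontal-gradient test image
img = np.tile(np.arange(512, dtype=np.uint8), (512, 1))

half = Manager.downsample_image(img, 2)  # zoom level w=1 -> factor 2**1
print(img.shape, '->', half.shape)       # (512, 512) -> (256, 256)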
class MemoryStateManager:
    '''
    Meaningless for anything other than tests
    '''

    def __init__(self, size=10):
        self.size = size
        self._data = LRU(self.size)
        self._locks = {}
        self._canceled = set()
        self.worker_id = uuid.uuid4().hex

    def set_loop(self, loop=None):
        pass

    async def update(self, task_id, data, ttl=None):
        # updates existing data with new data
        existing = await self.get(task_id)
        existing.update(data)
        self._data[task_id] = existing

    async def get(self, task_id):
        return self._data.get(task_id, {})

    async def exists(self, task_id):
        return task_id in self._data

    async def list(self):
        for task_id in self._data.keys():
            yield task_id

    async def acquire(self, task_id, ttl):
        already_locked = await self.is_locked(task_id)
        if already_locked:
            raise TaskAlreadyAcquired(task_id)

        # set new lock
        from guillotina_amqp.utils import TimeoutLock
        lock = TimeoutLock(self.worker_id)
        await lock.acquire(ttl=ttl)
        self._locks[task_id] = lock

    async def is_mine(self, task_id):
        if task_id not in self._locks:
            raise TaskNotFoundException(task_id)
        lock = self._locks[task_id]
        return lock.locked() and lock.worker_id == self.worker_id

    async def is_locked(self, task_id):
        if task_id not in self._locks:
            return False
        return self._locks[task_id].locked()

    async def release(self, task_id):
        if not await self.is_mine(task_id):
            # you can't release a lock that's not yours
            raise TaskAccessUnauthorized(task_id)
        # release lock and pop it from the data structure
        self._locks[task_id].release()
        self._locks.pop(task_id, None)

    async def refresh_lock(self, task_id, ttl):
        if task_id not in self._locks:
            raise TaskNotFoundException(task_id)
        if not await self.is_locked(task_id):
            raise Exception(f'Task {task_id} is not locked')
        if not await self.is_mine(task_id):
            # you can't refresh a lock that's not yours
            raise TaskAccessUnauthorized(task_id)
        # refresh
        return await self._locks[task_id].refresh_lock(ttl)

    async def cancel(self, task_id):
        self._canceled.update({task_id})
        return True

    async def cancelation_list(self):
        canceled = copy.deepcopy(self._canceled)
        for task_id in canceled:
            yield task_id

    async def clean_canceled(self, task_id):
        try:
            self._canceled.remove(task_id)
            return True
        except KeyError:
            # task id wasn't canceled
            return False

    async def is_canceled(self, task_id):
        return task_id in self._canceled

    async def _clean(self):
        self._data = LRU(self.size)
        self._locks = {}
        self._canceled = set()
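A hedged smoke test of the in-memory manager above; it exercises only the dict-backed paths (update/get/cancel), so nothing beyond the standard library and lru-dict is required, whereas acquire/release would additionally need guillotina_amqp installed:

import asyncio

async def demo():
    mgr = MemoryStateManager(size=10)

    await mgr.update("task-1", {"status": "scheduled"})
    await mgr.update("task-1", {"progress": 0.5})  # merged into existing data
    print(await mgr.get("task-1"))  # {'status': 'scheduled', 'progress': 0.5}

    await mgr.cancel("task-1")
    print(await mgr.is_canceled("task-1"))  # True
    async for task_id in mgr.cancelation_list():
        print("canceled:", task_id)

asyncio.run(demo())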
# (setup reconstructed from the outputs below: a capacity-5 LRU filled with 0..4)
l = LRU(5)
for i in range(5):
    l[i] = str(i)

print(l.items())    # prints items in MRU order
# Would print [(4, '4'), (3, '3'), (2, '2'), (1, '1'), (0, '0')]

print(l.peek_first_item(), l.peek_last_item())  # return the MRU key and LRU key
# Would print (4, '4') (0, '0')

l[5] = '5'          # inserting one more item should evict the old item
print(l.items())
# Would print [(5, '5'), (4, '4'), (3, '3'), (2, '2'), (1, '1')]

l[3]                # accessing an item makes it MRU
print(l.items())
# Would print [(3, '3'), (5, '5'), (4, '4'), (2, '2'), (1, '1')]
# Now 3 is in front

l.keys()            # can get keys alone in MRU order
# Would print [3, 5, 4, 2, 1]

del l[4]            # delete an item
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2'), (1, '1')]

print(l.get_size())
# Would print 5

l.set_size(3)       # shrinking evicts from the LRU end
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2')]

print(l.get_size())
# Would print 3

print(l.has_key(5))
# Would print True
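To round out the API tour, here is a minimal sketch of the classic use for this structure: a bounded get-or-compute cache. It relies only on operations demonstrated above (construction, get, item assignment); the function being cached is made up:

from lru import LRU  # pip install lru-dict

cache = LRU(128)  # bounded: least-recently-used entries are evicted first

def expensive(n):
    # stand-in for real work
    return n * n

def cached_expensive(n):
    sentinel = object()
    value = cache.get(n, sentinel)
    if value is sentinel:
        value = expensive(n)
        cache[n] = value  # store; also marks n as most recently used
    return value

print(cached_expensive(12))  # computed: 144
print(cached_expensive(12))  # served from the LRU cache: 144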