Example #1
    def test_callback(self):

        counter = [0]

        first_key = 'a'
        first_value = 1

        def callback(key, value):
            self.assertEqual(key, first_key)
            self.assertEqual(value, first_value)
            counter[0] += 1

        l = LRU(1, callback=callback)
        l[first_key] = first_value
        l['b'] = 1  # test calling the callback

        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])

        l['b'] = 2  # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])
        self.assertEqual(l.values(), [2])

        l = LRU(1, callback=callback)
        l[first_key] = first_value

        l.set_callback(None)
        l['c'] = 1  # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['c'])

        l.set_callback(callback)
        del l['c']  # doesn't call callback
        self.assertEqual(l.keys(), [])
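
The callback fires only when a new key pushes the least-recently-used entry out; overwriting or deleting an existing key does not trigger it, which is exactly what the assertions above check. A minimal sketch of the same mechanism used for resource cleanup instead of counting, assuming the lru-dict package (from lru import LRU) and hypothetical file handles:

from lru import LRU

def close_on_evict(key, handle):
    # invoked with the evicted (key, value) pair
    handle.close()

handles = LRU(2, callback=close_on_evict)
handles['a'] = open('a.txt', 'w')
handles['b'] = open('b.txt', 'w')
handles['c'] = open('c.txt', 'w')  # evicts 'a' and closes its file handle
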
Example #2
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        self.l1 = LRU(c_hash)
        self.l2 = LRU(c_user)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1

        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if self.l1.has_key(h):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if self.l2.has_key(u):
                u_ind += u1 - self.l2.keys().index(u)

        if h_match != 0:
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = (h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1])) \
            + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1])) + c
        return similarity
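
The counting idiom above, lru[k] = lru.get(k, 0) + 1, turns each LRU into a bounded frequency table: counts for hashtags or users that have not been seen recently eventually fall out of the cache. A small sketch of just that idiom, assuming lru-dict, with a made-up capacity and token stream:

from lru import LRU

def count_tokens(capacity, tokens):
    counts = LRU(capacity)
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1  # bump, or (re)insert at MRU position
    return counts

freq = count_tokens(3, ['a', 'b', 'a', 'c', 'd'])
print(freq.items())  # 'b' was least recently used and has been evicted
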
Example #3
    def test_lru(self):
        l = LRU(1)
        l['a'] = 1
        l['a']
        self.assertEqual(l.keys(), ['a'])
        l['b'] = 2
        self.assertEqual(l.keys(), ['b'])

        l = LRU(2)
        l['a'] = 1
        l['b'] = 2
        self.assertEqual(len(l), 2)
        l['a']                  # Testing the first one
        l['c'] = 3
        self.assertEqual(sorted(l.keys()), ['a', 'c'])
        l['c']
        self.assertEqual(sorted(l.keys()), ['a', 'c'])

        l = LRU(3)
        l['a'] = 1
        l['b'] = 2
        l['c'] = 3
        self.assertEqual(len(l), 3)
        l['b']                  # Testing the middle one
        l['d'] = 4
        self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
        l['d']                  # Testing the last one
        self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
        l['e'] = 5
        self.assertEqual(sorted(l.keys()), ['b', 'd', 'e'])
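
These assertions depend on lru-dict ordering keys() from most- to least-recently used, so a plain read is enough to protect an entry from the next eviction. A short sketch that makes the ordering visible, assuming the same LRU class; peek_last_item() shows the eviction candidate without changing the order:

from lru import LRU

l = LRU(3)
l['a'] = 1
l['b'] = 2
l['c'] = 3
l['a']                      # touch 'a' -> most recently used again
print(l.keys())             # ['a', 'c', 'b'] (MRU first)
print(l.peek_last_item())   # ('b', 2) -- next to be evicted
l['d'] = 4                  # evicts 'b'
print(l.keys())             # ['d', 'a', 'c']
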
Example #4
    def test_lru(self):
        l = LRU(1)
        l["a"] = 1
        l["a"]
        self.assertEqual(l.keys(), ["a"])
        l["b"] = 2
        self.assertEqual(l.keys(), ["b"])

        l = LRU(2)
        l["a"] = 1
        l["b"] = 2
        self.assertEqual(len(l), 2)
        l["a"]  # Testing the first one
        l["c"] = 3
        self.assertEqual(sorted(l.keys()), ["a", "c"])
        l["c"]
        self.assertEqual(sorted(l.keys()), ["a", "c"])

        l = LRU(3)
        l["a"] = 1
        l["b"] = 2
        l["c"] = 3
        self.assertEqual(len(l), 3)
        l["b"]  # Testing the middle one
        l["d"] = 4
        self.assertEqual(sorted(l.keys()), ["b", "c", "d"])
        l["d"]  # Testing the last one
        self.assertEqual(sorted(l.keys()), ["b", "c", "d"])
        l["e"] = 5
        self.assertEqual(sorted(l.keys()), ["b", "d", "e"])
Example #5
    def test_lru(self):
        l = LRU(1)
        l['a'] = 1
        l['a']
        self.assertEqual(l.keys(), ['a'])
        l['b'] = 2
        self.assertEqual(l.keys(), ['b'])

        l = LRU(2)
        l['a'] = 1
        l['b'] = 2
        self.assertEqual(len(l), 2)
        l['a']                  # Testing the first one
        l['c'] = 3
        self.assertEqual(sorted(l.keys()), ['a', 'c'])
        l['c']
        self.assertEqual(sorted(l.keys()), ['a', 'c'])

        l = LRU(3)
        l['a'] = 1
        l['b'] = 2
        l['c'] = 3
        self.assertEqual(len(l), 3)
        l['b']                  # Testing the middle one
        l['d'] = 4
        self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
        l['d']                  # Testing the last one
        self.assertEqual(sorted(l.keys()), ['b', 'c', 'd'])
        l['e'] = 5
        self.assertEqual(sorted(l.keys()), ['b', 'd', 'e'])
Example #6
def test_bench_with_original(benchmark, data, collector):
    m = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    c1 = LRU(2000)
    benchmark.pedantic(run_cache, args=(c1, data), iterations=1, rounds=100)
    hits, misses = c1.get_stats()
    items = len(c1.keys())
    del c1
    gc.collect()
    collector(
        dict(hits=hits,
             misses=misses,
             items=items,
             memory=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - m))
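
benchmark, data and collector are pytest fixtures defined elsewhere, and run_cache is not shown; lru-dict's get_stats() supplies the (hits, misses) pair that gets reported. A hypothetical run_cache, only to make the bookkeeping above concrete:

from lru import LRU

def run_cache(cache, data):
    # data is assumed to be an iterable of (key, value) pairs
    for key, value in data:
        if cache.get(key) is None:  # a miss here should show up in get_stats()
            cache[key] = value

c = LRU(2000)
run_cache(c, [(i % 3000, i) for i in range(10000)])
hits, misses = c.get_stats()
print(hits, misses, len(c.keys()))
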
Example #7
    def test_callback(self):

        counter = [0]

        first_key = 'a'
        first_value = 1

        def callback(key, value):
            self.assertEqual(key, first_key)
            self.assertEqual(value, first_value)
            counter[0] += 1

        l = LRU(1, callback=callback)
        l[first_key] = first_value
        l['b'] = 1              # test calling the callback

        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])

        l['b'] = 2              # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])
        self.assertEqual(l.values(), [2])


        l = LRU(1, callback=callback)
        l[first_key] = first_value

        l.set_callback(None)
        l['c'] = 1              # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['c'])

        l.set_callback(callback)
        del l['c']              # doesn't call callback
        self.assertEqual(l.keys(), [])
Example #8
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size

        # default to the conservative fallback below if hostcnt is unknown
        self._read_cache_limit = 0
        self._write_cache_limit = 0

        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            procs_per_host = self.circle.size // hostcnt
            self._read_cache_limit = ((max_ofile - 64) // procs_per_host) // 3
            self._write_cache_limit = ((max_ofile - 64) // procs_per_host) * 2 // 3

        if self._read_cache_limit <= 0 or self._write_cache_limit <= 0:
            self._read_cache_limit = 1
            self._write_cache_limit = 8

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # count of enqueued items
        self.reduce_items = 0  # count of processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        for f in self.rfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        for f in self.wfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # We need to do this because if the last job didn't finish cleanly,
        # fwalk files may be left over; when fcp cleanup gets the chance,
        # it should remove them as well.
        """
        fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)
        """

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """

        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # the batching of operations is hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # both memory and database checkpoint
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        #G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        #print("Total chunks: ",G.total_chunks)

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @return: file descriptor
        """
        if k in d:
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit
            # clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError as e:
                log.warn("FD for %s not valid when closing" % old_k, extra=self.d)

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        src = work.src
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY, self._write_cache_limit)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)
        print("\nMake checkpoint files: ", self.checkpoint_file)

    def do_checkpoint(self):
        # When making a checkpoint, first write workq and workq_buf into the
        # checkpoint file, then make a copy of workq_db if it exists.
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, make a copy of the current workq db file,
                # which is the provided checkpoint db file
                self.workdir = os.getcwd()
                existingCheckpoint = os.path.join(self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existingCheckpoint, self.checkpoint_db)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        else:
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if self.use_store == False:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
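
do_open() and cleanup() above manage descriptor lifetimes by hand on top of the two LRU caches. An alternative sketch (not how FCP does it) that lets lru-dict's eviction callback close descriptors as they age out of the cache; the cache size and helper names are made up:

import os
from lru import LRU

def _close_evicted(path, fd):
    try:
        os.close(fd)              # descriptor just fell out of the cache
    except OSError:
        pass                      # already closed or otherwise invalid

fd_cache = LRU(8, callback=_close_evicted)

def cached_open(path, flags=os.O_RDONLY):
    if path in fd_cache:
        return fd_cache[path]     # reuse and mark as recently used
    fd = os.open(path, flags)
    fd_cache[path] = fd           # may evict (and close) the oldest descriptor
    return fd
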
Example #9
    def test_empty(self):
        l = LRU(1)
        self.assertEqual([], l.keys())
        self.assertEqual([], l.values())
Example #10
class Cache:
    """Class representing D3N."""

    # Replacement policies
    LRU = "LRU"
    LFU = "LFU"
    LRU_S = "LRU_S"
    FIFO = "FIFO"
    RAND = "RAND"

    # Write policies
    WRITE_BACK = "WB"
    WRITE_THROUGH = "WT"

    # Layer
    L1 = "L1"
    L2 = "L2"

    consistent = "consistent"
    rendezvous = "rendezvous"
    rr = "rr"

    def __init__(self, layer, size, replace_pol, write_pol, hash_ring,
                 hash_type, obj_size, full_size, logger):
        self._replace_pol = replace_pol  # Replacement policy
        self._write_pol = write_pol  # Write policy
        self._layer = layer  # Layer info
        self._size = size  # Cache size
        self.spaceLeft = size  # Cache size
        self._logger = logger
        self.hashmap = {}  # Mapping
        self.hash_ring = hash_ring
        self._hash_type = hash_type
        self._obj_size = obj_size

        if (self._size == 0):
            self.zerosize = True
            self._size = 1
        else:
            self.zerosize = False

        if (self._replace_pol == Cache.LRU):
            self.cache = LRU(self._size)
        elif (self._replace_pol == Cache.FIFO):
            self.cache = deque()
        elif (self._replace_pol == Cache.LRU_S):
            self.cache = LRU(self._size)
            self.shadow = LRU(full_size)
            self.hist = []
            for i in range(full_size):
                self.hist.append(0)

        # Statistics
        self._hit_count = 0
        self._miss_count = 0
        self._backend_bw = 0
        self._crossrack_bw = 0
        self._intrarack_bw = 0
        self.miss_lat = 0
        self.lat_count = 0

    def _insert1(self, key, size):
        # No eviction
        if not self.zerosize:
            if (self._replace_pol == Cache.LRU_S):
                self.shadow[key] = 1

            if (int(size) <= self.spaceLeft):
                if (self._replace_pol == Cache.LRU):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.LRU_S):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.FIFO):
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)
            else:
                while (int(size) > self.spaceLeft):
                    self._evict()
                if (self._replace_pol == Cache.LRU):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.LRU_S):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.FIFO):
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)

    def _insert(self, key, size):
        # No eviction
        if not self.zerosize:
            if (self._replace_pol == Cache.LRU_S):
                self.cache[key] = int(size)
                self.shadow[key] = int(size)
            elif (self._replace_pol == Cache.LRU):
                self.cache[key] = int(size)
            else:
                if (int(size) <= self.spaceLeft):
                    if (self._replace_pol == Cache.LRU):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.LRU_S):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.FIFO):
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)
                else:
                    while (int(size) > self.spaceLeft):
                        self._evict()
                    if (self._replace_pol == Cache.LRU):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.LRU_S):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.FIFO):
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)

    def read1(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return None
        r = None

        if (self._replace_pol == Cache.LRU_S):
            if self.shadow.has_key(key):
                count = 0
                for i in self.shadow.keys():
                    if i == key:
                        self.hist[count] += 1
                        break
                    count += 1
                self.shadow[key] = 1

        if key in self.hashmap:
            if (self._replace_pol == Cache.LRU):
                self._update_use(key)
            elif (self._replace_pol == Cache.LRU_S):
                self._update_use(key)
            self._hit_count += 1
            r = 1
        else:
            self._miss_count += 1
        return r

    def read(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return None
        r = None

        if (self._replace_pol == Cache.LRU_S):
            if self.cache.has_key(key):
                self._hit_count += 1
                self.cache[key] = self.cache[key]
                r = 1
            else:
                self._miss_count += 1

            if self.shadow.has_key(key):
                count = 0
                for i in self.shadow.keys():
                    if i == key:
                        self.hist[count] += 1
                        break
                    count += 1
                self.shadow[key] = 1

        else:
            if key in self.hashmap:
                if (self._replace_pol == Cache.LRU):
                    self._update_use(key)
                elif (self._replace_pol == Cache.LRU_S):
                    self._update_use(key)
                self._hit_count += 1
                r = 1
            else:
                self._miss_count += 1
        return r

    def checkKey(self, key):
        """Check whether a key is present in the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return 0
        r = 0

        if (self._replace_pol == Cache.LRU_S) or (self._replace_pol
                                                  == Cache.LRU):
            if self.cache.has_key(key):
                r = 1
            else:
                r = 0
        return r

    def _evict(self):
        if (self._replace_pol == Cache.LRU):
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif (self._replace_pol == Cache.LRU_S):
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif (self._replace_pol == Cache.FIFO):
            id = self.cache.popleft()
        self.spaceLeft += int(self.hashmap[id])
        del self.hashmap[id]

    def _update_use(self, key):
        """Update the use of a cache."""
        if (self._replace_pol == Cache.LRU):
            self.cache[key] = self.hashmap[key]
        if (self._replace_pol == Cache.LRU_S):
            self.cache[key] = self.hashmap[key]

    def set_cache_size(self, size):
        new_size = self.cache.get_size() + int(size)
        self.cache.set_size(int(new_size))

    def set_backend_bw(self, value):
        self._backend_bw += value

    def set_crossrack_bw(self, value):
        self._crossrack_bw += value

    def set_intrarack_bw(self, value):
        self._intrarack_bw += value

    def get_backend_bw(self):
        return self._backend_bw

    def get_crossrack_bw(self):
        return self._crossrack_bw

    def get_intrarack_bw(self):
        return self._intrarack_bw

    def get_replace_pol(self):
        return self._replace_pol

    def get_hit_count(self):
        return self._hit_count

    def get_miss_count(self):
        return self._miss_count

    def get_available_space(self):
        return self.spaceLeft

    def get_replace_poll(self):
        return self._replace_pol

    def reset_shadow_cache(self):
        self.shadow.clear()

    def print_cache(self):
        print(self.cache)

    def get_l2_address(self, key):
        if (self._hash_type == Cache.consistent):
            return self.hash_ring.get_node(key)
        elif (self._hash_type == Cache.rendezvous):
            return self.hash_ring.find_node(key)
        elif (self._hash_type == Cache.rr):
            val = key.split("_")[1]
            res = int(val) % int(self.hash_ring)
            return res
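
_evict() relies on lru-dict's peek_last_item() to find the least-recently-used key and then deletes it by hand so that hashmap and spaceLeft stay consistent with the byte budget. A stripped-down sketch of that pattern, assuming lru-dict, with invented limits:

from lru import LRU

class ByteBudgetCache:
    def __init__(self, max_entries, byte_budget):
        self.cache = LRU(max_entries)  # recency tracking only
        self.hashmap = {}              # key -> object size, as in Cache.hashmap
        self.space_left = byte_budget

    def _evict(self):
        key, _ = self.cache.peek_last_item()  # least recently used entry
        del self.cache[key]
        self.space_left += self.hashmap.pop(key)

    def insert(self, key, size):
        # like Cache._insert1, assumes size never exceeds the whole budget
        while size > self.space_left:
            self._evict()
        self.cache[key] = size
        self.hashmap[key] = size
        self.space_left -= size
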
Example #11
    def test_empty(self):
        l = LRU(1)
        self.assertEqual([], l.keys())
        self.assertEqual([], l.values())
Example #12
class AccountDB(AccountDatabaseAPI):
    logger = get_extended_debug_logger('eth.db.account.AccountDB')

    def __init__(self,
                 db: AtomicDatabaseAPI,
                 state_root: Hash32 = BLANK_ROOT_HASH) -> None:
        r"""
        Internal implementation details (subject to rapid change):
        Database entries go through several pipes, like so...

        .. code::

            db > _batchdb ---------------------------> _journaldb ----------------> code lookups
             \
              -> _batchtrie -> _trie -> _trie_cache -> _journaltrie --------------> account lookups

        Journaling sequesters writes at the _journal* attrs ^, until persist is called.

        _batchtrie enables us to prune all trie changes while building
        state,  without deleting old trie roots.

        _batchdb and _batchtrie together enable us to make the state root,
        without saving everything to the database.

        _journaldb is a journaling of the keys and values used to store
        code and account storage.

        _trie is a hash-trie, used to generate the state root

        _trie_cache is a cache tied to the state root of the trie. It
        is important that this cache is checked *after* looking for
        the key in _journaltrie, because the cache is only invalidated
        after a state root change.

        _journaltrie is a journaling of the accounts (an address->rlp mapping,
        rather than the nodes stored by the trie). This enables
        a squashing of all account changes before pushing them into the trie.

        .. NOTE:: StorageDB works similarly

        AccountDB synchronizes the snapshot/revert/persist of both of the
        journals.
        """
        self._raw_store_db = KeyAccessLoggerAtomicDB(db,
                                                     log_missing_keys=False)
        self._batchdb = BatchDB(self._raw_store_db)
        self._batchtrie = BatchDB(self._raw_store_db,
                                  read_through_deletes=True)
        self._journaldb = JournalDB(self._batchdb)
        self._trie = HashTrie(
            HexaryTrie(self._batchtrie, state_root, prune=True))
        self._trie_logger = KeyAccessLoggerDB(self._trie,
                                              log_missing_keys=False)
        self._trie_cache = CacheDB(self._trie_logger)
        self._journaltrie = JournalDB(self._trie_cache)
        self._account_cache = LRU(2048)
        self._account_stores: Dict[Address, AccountStorageDatabaseAPI] = {}
        self._dirty_accounts: Set[Address] = set()
        self._root_hash_at_last_persist = state_root
        self._accessed_accounts: Set[Address] = set()
        self._accessed_bytecodes: Set[Address] = set()

    @property
    def state_root(self) -> Hash32:
        return self._trie.root_hash

    @state_root.setter
    def state_root(self, value: Hash32) -> None:
        if self._trie.root_hash != value:
            self._trie_cache.reset_cache()
            self._trie.root_hash = value

    def has_root(self, state_root: bytes) -> bool:
        return state_root in self._batchtrie

    #
    # Storage
    #
    def get_storage(self,
                    address: Address,
                    slot: int,
                    from_journal: bool = True) -> int:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(slot, title="Storage Slot")

        account_store = self._get_address_store(address)
        return account_store.get(slot, from_journal)

    def set_storage(self, address: Address, slot: int, value: int) -> None:
        validate_uint256(value, title="Storage Value")
        validate_uint256(slot, title="Storage Slot")
        validate_canonical_address(address, title="Storage Address")

        account_store = self._get_address_store(address)
        self._dirty_accounts.add(address)
        account_store.set(slot, value)

    def delete_storage(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")

        self._set_storage_root(address, BLANK_ROOT_HASH)
        self._wipe_storage(address)

    def _wipe_storage(self, address: Address) -> None:
        """
        Wipe out the storage, without explicitly handling the storage root update
        """
        account_store = self._get_address_store(address)
        self._dirty_accounts.add(address)
        account_store.delete()

    def _get_address_store(self,
                           address: Address) -> AccountStorageDatabaseAPI:
        if address in self._account_stores:
            store = self._account_stores[address]
        else:
            storage_root = self._get_storage_root(address)
            store = AccountStorageDB(self._raw_store_db, storage_root, address)
            self._account_stores[address] = store
        return store

    def _dirty_account_stores(
            self) -> Iterable[Tuple[Address, AccountStorageDatabaseAPI]]:
        for address in self._dirty_accounts:
            store = self._account_stores[address]
            yield address, store

    @to_tuple
    def _get_changed_roots(self) -> Iterable[Tuple[Address, Hash32]]:
        # list all the accounts that were changed, and their new storage roots
        for address, store in self._dirty_account_stores():
            if store.has_changed_root:
                yield address, store.get_changed_root()

    def _get_storage_root(self, address: Address) -> Hash32:
        account = self._get_account(address)
        return account.storage_root

    def _set_storage_root(self, address: Address,
                          new_storage_root: Hash32) -> None:
        account = self._get_account(address)
        self._set_account(address, account.copy(storage_root=new_storage_root))

    def _validate_flushed_storage(self, address: Address,
                                  store: AccountStorageDatabaseAPI) -> None:
        if store.has_changed_root:
            actual_storage_root = self._get_storage_root(address)
            expected_storage_root = store.get_changed_root()
            if expected_storage_root != actual_storage_root:
                raise ValidationError(
                    "Storage root was not saved to account before trying to persist roots. "
                    f"Account {address!r} had storage {actual_storage_root!r}, "
                    f"but should be {expected_storage_root!r}.")

    #
    # Balance
    #
    def get_balance(self, address: Address) -> int:
        validate_canonical_address(address, title="Storage Address")

        account = self._get_account(address)
        return account.balance

    def set_balance(self, address: Address, balance: int) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(balance, title="Account Balance")

        account = self._get_account(address)
        self._set_account(address, account.copy(balance=balance))

    #
    # Nonce
    #
    def get_nonce(self, address: Address) -> int:
        validate_canonical_address(address, title="Storage Address")

        account = self._get_account(address)
        return account.nonce

    def set_nonce(self, address: Address, nonce: int) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_uint256(nonce, title="Nonce")

        account = self._get_account(address)
        self._set_account(address, account.copy(nonce=nonce))

    def increment_nonce(self, address: Address) -> None:
        current_nonce = self.get_nonce(address)
        self.set_nonce(address, current_nonce + 1)

    #
    # Code
    #
    def get_code(self, address: Address) -> bytes:
        validate_canonical_address(address, title="Storage Address")

        code_hash = self.get_code_hash(address)
        if code_hash == EMPTY_SHA3:
            return b''
        else:
            try:
                return self._journaldb[code_hash]
            except KeyError:
                raise MissingBytecode(code_hash)  #from KeyError
            finally:
                if code_hash in self._get_accessed_node_hashes():
                    self._accessed_bytecodes.add(address)

    def set_code(self, address: Address, code: bytes) -> None:
        validate_canonical_address(address, title="Storage Address")
        validate_is_bytes(code, title="Code")

        account = self._get_account(address)

        code_hash = keccak(code)
        self._journaldb[code_hash] = code
        self._set_account(address, account.copy(code_hash=code_hash))

    def get_code_hash(self, address: Address) -> Hash32:
        validate_canonical_address(address, title="Storage Address")

        account = self._get_account(address)
        return account.code_hash

    def delete_code(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")

        account = self._get_account(address)
        self._set_account(address, account.copy(code_hash=EMPTY_SHA3))

    #
    # Account Methods
    #
    def account_has_code_or_nonce(self, address: Address) -> bool:
        return self.get_nonce(address) != 0 or self.get_code_hash(
            address) != EMPTY_SHA3

    def delete_account(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")

        # We must wipe the storage first, because if it's the first time we load it,
        #   then we want to load it with the original storage root hash, not the
        #   empty one. (in case of a later revert, we don't want to poison the storage cache)
        self._wipe_storage(address)

        if address in self._account_cache:
            del self._account_cache[address]
        del self._journaltrie[address]

    def account_exists(self, address: Address) -> bool:
        validate_canonical_address(address, title="Storage Address")
        account_rlp = self._get_encoded_account(address, from_journal=True)
        return account_rlp != b''

    def touch_account(self, address: Address) -> None:
        validate_canonical_address(address, title="Storage Address")

        account = self._get_account(address)
        self._set_account(address, account)

    def account_is_empty(self, address: Address) -> bool:
        return not self.account_has_code_or_nonce(
            address) and self.get_balance(address) == 0

    #
    # Internal
    #
    def _get_encoded_account(self,
                             address: Address,
                             from_journal: bool = True) -> bytes:
        self._accessed_accounts.add(address)
        lookup_trie = self._journaltrie if from_journal else self._trie_cache

        try:
            return lookup_trie[address]
        except trie_exceptions.MissingTrieNode as exc:
            raise MissingAccountTrieNode(*exc.args) from exc
        except KeyError:
            # In case the account is deleted in the JournalDB
            return b''

    def _get_account(self,
                     address: Address,
                     from_journal: bool = True) -> Account:
        if from_journal and address in self._account_cache:
            return self._account_cache[address]

        rlp_account = self._get_encoded_account(address, from_journal)

        if rlp_account:
            account = rlp.decode(rlp_account, sedes=Account)
        else:
            account = Account()
        if from_journal:
            self._account_cache[address] = account
        return account

    def _set_account(self, address: Address, account: Account) -> None:
        self._account_cache[address] = account
        rlp_account = rlp.encode(account, sedes=Account)
        self._journaltrie[address] = rlp_account

    #
    # Record and discard API
    #
    def record(self) -> JournalDBCheckpoint:
        checkpoint = self._journaldb.record()
        self._journaltrie.record(checkpoint)

        for _, store in self._dirty_account_stores():
            store.record(checkpoint)
        return checkpoint

    def discard(self, checkpoint: JournalDBCheckpoint) -> None:
        self._journaldb.discard(checkpoint)
        self._journaltrie.discard(checkpoint)
        self._account_cache.clear()
        for _, store in self._dirty_account_stores():
            store.discard(checkpoint)

    def commit(self, checkpoint: JournalDBCheckpoint) -> None:
        self._journaldb.commit(checkpoint)
        self._journaltrie.commit(checkpoint)
        for _, store in self._dirty_account_stores():
            store.commit(checkpoint)

    def lock_changes(self) -> None:
        for _, store in self._dirty_account_stores():
            store.lock_changes()

    def make_state_root(self) -> Hash32:
        for _, store in self._dirty_account_stores():
            store.make_storage_root()

        for address, storage_root in self._get_changed_roots():
            if self.account_exists(address) or storage_root != BLANK_ROOT_HASH:
                self._set_storage_root(address, storage_root)

        self._journaldb.persist()

        diff = self._journaltrie.diff()
        if diff.deleted_keys() or diff.pending_items():
            # In addition to squashing (which is redundant here), this context manager causes
            # an atomic commit of the changes, so exceptions will revert the trie
            with self._trie.squash_changes() as memory_trie:
                self._apply_account_diff_without_proof(diff, memory_trie)

        self._journaltrie.reset()
        self._trie_cache.reset_cache()

        return self.state_root

    def persist(self) -> MetaWitnessAPI:
        self.make_state_root()

        # persist storage
        with self._raw_store_db.atomic_batch() as write_batch:
            for address, store in self._dirty_account_stores():
                self._validate_flushed_storage(address, store)
                store.persist(write_batch)

        for address, new_root in self._get_changed_roots():
            if new_root is None:
                raise ValidationError(
                    f"Cannot validate new root of account 0x{address.hex()} "
                    f"which has a new root hash of None")
            elif new_root not in self._raw_store_db and new_root != BLANK_ROOT_HASH:
                raise ValidationError(
                    "After persisting storage trie, a root node was not found. "
                    f"State root for account 0x{address.hex()} "
                    f"is missing for hash 0x{new_root.hex()}.")

        # generate witness (copy) before clearing the underlying data
        meta_witness = self._get_meta_witness()

        # reset local storage trackers
        self._account_stores = {}
        self._dirty_accounts = set()
        self._accessed_accounts = set()
        self._accessed_bytecodes = set()
        # We have to clear the account cache here so that future account accesses
        #   will get added to _accessed_accounts correctly. Account accesses that
        #   are cached do not add the address to the list of accessed accounts.
        self._account_cache.clear()

        # persist accounts
        self._validate_generated_root()
        new_root_hash = self.state_root
        with self._raw_store_db.atomic_batch() as write_batch:
            self._batchtrie.commit_to(write_batch, apply_deletes=False)
            self._batchdb.commit_to(write_batch, apply_deletes=False)
        self._root_hash_at_last_persist = new_root_hash

        return meta_witness

    def _get_accessed_node_hashes(self) -> Set[Hash32]:
        return cast(Set[Hash32], self._raw_store_db.keys_read)

    @to_dict
    def _get_access_list(
            self) -> Iterable[Tuple[Address, AccountQueryTracker]]:
        """
        Get the list of addresses that were accessed, whether the bytecode was accessed, and
        which storage slots were accessed.
        """
        for address in self._accessed_accounts:
            did_access_bytecode = address in self._accessed_bytecodes
            if address in self._account_stores:
                accessed_storage_slots = self._account_stores[
                    address].get_accessed_slots()
            else:
                accessed_storage_slots = frozenset()
            yield address, AccountQueryTracker(did_access_bytecode,
                                               accessed_storage_slots)

    def _get_meta_witness(self) -> MetaWitness:
        """
        Get a variety of metadata about the state witness needed to execute the block.

        This creates a copy, so that underlying changes do not affect the returned MetaWitness.
        """
        return MetaWitness(self._get_accessed_node_hashes(),
                           self._get_access_list())

    def _validate_generated_root(self) -> None:
        db_diff = self._journaldb.diff()
        if len(db_diff):
            raise ValidationError(
                f"AccountDB had a dirty db when it needed to be clean: {db_diff!r}"
            )
        trie_diff = self._journaltrie.diff()
        if len(trie_diff):
            raise ValidationError(
                f"AccountDB had a dirty trie when it needed to be clean: {trie_diff!r}"
            )

    def _apply_account_diff_without_proof(self, diff: DBDiff,
                                          trie: DatabaseAPI) -> None:
        """
        Apply diff of trie updates, when original nodes might be missing.
        Note that doing this naively will raise exceptions about missing nodes
        from *intermediate* trie roots. This captures exceptions and uses the previous
        trie root hash that will be recognized by other nodes.
        """
        # It's fairly common that when an account is deleted, we need to retrieve nodes
        # for accounts that were not needed during normal execution. We only need these
        # nodes to refactor the trie.
        for delete_key in diff.deleted_keys():
            try:
                del trie[delete_key]
            except trie_exceptions.MissingTrieNode as exc:
                raise MissingAccountTrieNode(
                    exc.missing_node_hash,
                    self._root_hash_at_last_persist,
                    exc.requested_key,
                ) from exc

        # It's fairly unusual, but possible, that setting an account will need unknown
        # nodes during a trie refactor. Here is an example that seems to cause it:
        #
        # Setup:
        #   - Root node is a branch, with 0 pointing to a leaf
        #   - The complete leaf key is (0, 1, 2), so (1, 2) is in the leaf node
        #   - We know the leaf node hash but not the leaf node body
        # Refactor that triggers missing node:
        #   - Add value with key (0, 3, 4)
        #   - We need to replace the current leaf node with a branch that points leaves at 1 and 3
        #   - The leaf for key (0, 1, 2) now contains only the (2) part, so needs to be rebuilt
        #   - We need the full body of the old (1, 2) leaf node, to rebuild

        for key, val in diff.pending_items():
            try:
                trie[key] = val
            except trie_exceptions.MissingTrieNode as exc:
                raise MissingAccountTrieNode(
                    exc.missing_node_hash,
                    self._root_hash_at_last_persist,
                    exc.requested_key,
                ) from exc
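
Here the LRU(2048) instance acts as a read-through cache in front of the account trie: _get_account() checks it first, decodes on a miss, and both persist() and discard() clear it so stale entries cannot leak across state roots. A generic sketch of that read-through pattern, assuming lru-dict and a made-up loader callable:

from lru import LRU

class ReadThroughCache:
    def __init__(self, loader, size=2048):
        self._loader = loader   # called only on a cache miss
        self._cache = LRU(size)

    def get(self, key):
        if key in self._cache:  # hit: no backing-store access
            return self._cache[key]
        value = self._loader(key)
        self._cache[key] = value
        return value

    def invalidate(self):
        self._cache.clear()     # mirrors AccountDB clearing its cache on persist/discard
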
Example #13
File: DND.py Project: tabzraz/RL
class DND:
    def __init__(self, kernel, num_neighbors, max_memory, embedding_size):
        # self.dictionary = LRUCache(max_memory)
        # self.kd_tree = kdtree.create(dimensions=embedding_size)
        # rnd_projection = RandomBinaryProjections("RBP", 8)
        # distance = EuclideanDistance()
        # nearest = NearestFilter(num_neighbors)
        # self.nearpy = Engine(dim=embedding_size, lshashes=[rnd_projection], distance=distance, vector_filters=[nearest], fetch_vector_filters=[])

        self.kd_tree = None
        # self.data = []

        # self.lshash = LSHash(hash_size=embedding_size, input_dim=embedding_size, num_hashtables=10)
        self.lru = LRU(size=max_memory)

        self.num_neighbors = num_neighbors
        self.kernel = kernel
        self.max_memory = max_memory
        self.embedding_size = embedding_size
        # self.keys_added = []

    def is_present(self, key):
        return tuple(key) in self.lru  # self.lru.has_key(tuple(key))
        # return self.dictionary.get(tuple(key)) is not None
        # return self.dictionary.get(tuple(key.data.cpu().numpy()[0])) is not None

    def get_value(self, key):
        return self.lru[tuple(key)]
        # return self.dictionary.get(tuple(key))
        # return self.dictionary.get(tuple(key.data.cpu().numpy()[0]))

    def lookup(self, lookup_key):
        # TODO: Speed up search knn
        # keys = [key[0].data for key in self.kd_tree.search_knn(lookup_key, self.num_neighbors)]
        lookup_key_numpy = lookup_key.data[0].numpy()
        # lookup_key_tuple = tuple(lookup_key_numpy)
        # print(lookup_key)

        # keys = [key[0] for key in self.lshash.query_no_data(lookup_key_numpy, num_results=self.num_neighbors)]
        # keys = [key[1] for key in self.nearpy.neighbours(lookup_key_numpy)]
        if self.kd_tree is not None:
            # print(len(self.lru.keys()), lookup_key_numpy)
            # things_distances, things_index = self.kd_tree.query(lookup_key_numpy, k=self.num_neighbors, eps=1.0)
            things_index = self.kd_tree.query([lookup_key_numpy],
                                              k=min(self.num_neighbors,
                                                    len(self.kd_tree.data)),
                                              return_distance=False,
                                              sort_results=False)
            # print(things_index)
            keys = [self.lru.keys()[ii[0]] for ii in things_index]
            # print(keys)
        else:
            keys = []

        # print(keys)
        # print(keys)
        # output, kernel_sum = Variable(FloatTensor([0])), Variable(FloatTensor([0]))
        output, kernel_sum = 0, 0
        # if len(keys) != 0:
        # print(keys)
        # TODO: Speed this up since the kernel takes a significant amount of time
        for key in keys:
            # print("Key:",key, lookup_key)
            # if not np.allclose(key, lookup_key_numpy): #(key == lookup_key).data.all():
            if not np.all(key == lookup_key_numpy):
                # print("Here")
                # gg = Variable(FloatTensor(np.array(key)))
                # print(key)
                # gg = Variable(FloatTensor(key))
                gg = Variable(torch.from_numpy(np.array(key)))
                # print(tuple(key))
                # hh = lookup_key[0] - gg
                # print("Key:", gg, "Lookup key", lookup_key[0])
                # print(lookup_key[0] + gg)
                kernel_val = self.kernel(gg, lookup_key[0])
                # print("key:", self.lru.get(tuple(key)))
                # if not self.lru.has_key(tuple(key)):
                # print(keys)
                # print(tuple(key))
                # print(key in self.keys_added)
                # print(len(self.lru))
                # if tuple(key) not in self.lru:
                # print("NOT IN:", tuple(key))
                # print(len(keys))
                output += kernel_val * self.lru.get(tuple(key))
                # output += kernel_val * self.dictionary.get(tuple(key))
                # print("Key", key.requires_grad, key.volatile)
                # print("Kernel key", self.kernel(key, lookup_key).requires_grad)
                # print("Output in loop", output.requires_grad)
                kernel_sum += kernel_val  #self.kernel(key, lookup_key)
                # print(kernel_sum)
        # if len(keys) == 0:
        #     return (lookup_key * 0)[0][0]
        if isinstance(kernel_sum, int):
            return (lookup_key * 0)[0][0]
        # if kernel_sum == 0:
        # print("0 Kernel", kernel_sum)
        # if len(keys) == 0:
        # print("0 keys", len(keys))
        if kernel_sum.data[0] == 0 or len(keys) == 0:
            # print(lookup_key)
            # zeroed = (lookup_key * 0)[0][0]
            # print("Zero Lookup.", output.data, kernel_sum.data, len(keys))
            return (lookup_key * 0)[0][0]
        # print("lookup_key", lookup_key.requires_grad, lookup_key.volatile)
        # print("kernled", self.kernel(keys[0], lookup_key).requires_grad)
        # print("output", output.requires_grad, output.volatile)
        # print("ks", kernel_sum.requires_grad, kernel_sum.volatile)
        # print("Non-Zero Lookup for {}".format(lookup_key))
        output = output / kernel_sum
        # print(output)
        return output

    def upsert(self, key, value):
        # Store the value under a hashable tuple key and rebuild the KD-tree over
        # the LRU's current keys so the new entry is visible to lookups.
        self.lru[tuple(key)] = value
        self.kd_tree = KDTree(self.lru.keys())

        return
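        # NOTE: the early return above makes everything below unreachable; the
        # remainder of this method is kept only as a sketch of capacity-based eviction.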
        if len(self.lru) == self.max_memory:
            # Expel least recently used key from self.dictionary and self.kd_tree if memory used is at capacity
            # deleted_key = self.dictionary.delete_least_recently_used()[0]
            # deleted_key = self.lru.peek_last_item()[0]
            # print("Deleted key:",deleted_key)
            # deleted_key = np.array(deleted_key)
            # thing = Variable(torch.from_numpy(deleted_key).float()).unsqueeze(0)
            # thing = Variable(FloatTensor(deleted_key)).unsqueeze(0)
            # print("Thing:",thing)
            # print(self.dictionary.cache.keys())
            key_to_delete = self.lru.peek_last_item()
            self.lru[tuple(key)] = value
            # self.kd_tree.remove(Variable(FloatTensor(deleted_key)).unsqueeze(0))
            # self.kd_tree.remove(deleted_key)

            # Remake the LSHASH with the deleted key
            # print("remaking")

            # self.lshash = LSHash(hash_size=self.embedding_size, input_dim=self.embedding_size)
            # for k in self.lru.keys():
            #     self.lshash.index(np.array(k))

            # print("Deleting", np.array(key_to_delete[0]))
            # self.nearpy.delete_vector(key_to_delete[0])
            # self.nearpy.clean_all_buckets()
            # for k in self.lru.keys():
            # self.nearpy.store_vector(np.array(k))

            # Checking that the lru keys are the same as the keys in the lshash
            # for key in self.lru.keys():
            #     keys_close = [key[0] for key in self.lshash.query(key, num_results=5)]
            #     # print(keys_close)
            #     for kk in keys_close:
            #         if kk not in self.lru:
            #             print("\n\nProblems! Key in LSHASH not in LRU\n\n")

            # Check length of all lru keys
            # all_lru_keys = self.lshash.query(key)
            # print("\n", len(all_lru_keys), "\n")
        else:
            self.lru[tuple(key)] = value

        self.kd_tree = KDTree(self.lru.keys())
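
# A minimal, self-contained sketch of the KD-tree-over-LRU-keys pattern the
# lookup/upsert pair above relies on, stripped of the torch machinery. The names
# `memory` and `upsert`, the capacity and the vector sizes are illustrative
# assumptions, not part of the original module.
import numpy as np
from lru import LRU
from sklearn.neighbors import KDTree

memory = LRU(1000)                       # bounded associative memory

def upsert(key_vec, value):
    # tuple() makes the numpy vector hashable so it can serve as an LRU key
    memory[tuple(key_vec)] = value
    # rebuild the index over the LRU's current keys; keys() order defines the
    # mapping from KD-tree indices back to keys
    return KDTree(np.array(memory.keys()))

tree = None
for i in range(10):
    tree = upsert(np.random.rand(4), float(i))

keys_snapshot = memory.keys()            # must match the order the tree was built from
query = np.random.rand(4)
idx = tree.query([query], k=min(5, len(memory)), return_distance=False)
neighbours = [keys_snapshot[i] for i in idx[0]]
print([memory[k] for k in neighbours])   # values stored under the nearest keys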
Beispiel #14
0
class Manager(object):

    def __init__(self):
        '''
        '''
        self._views = LRU(50)
        # tile cache - enough for 1 MFOV for 10 parallel users
        self._tiles = LRU(61 * 10)

        self._client_tiles = {}

    def start(self):
        '''
        '''
        pass

    def check_path_type(self, data_path):
        '''
        Check whether the data_path is a scan, section or fov.
        '''

        # we should check how many levels deep is the IMAGE_COORDINATES_FILE
        # level 0: this is a FOV
        # level 1: this is a section
        # level 2: this is a scan

        if os.path.exists(
            os.path.join(
                data_path,
                settings.IMAGE_COORDINATES_FILE)):
            return 'FOV'

        if os.path.exists(
            os.path.join(
                data_path,
                Util.get_first_level_subdir(data_path),
                settings.IMAGE_COORDINATES_FILE)):
            return 'SECTION'

        if os.path.exists(
            os.path.join(
                data_path,
                Util.get_second_level_subdir(data_path),
                settings.IMAGE_COORDINATES_FILE)):
            return 'SCAN'

        return None

    def get_tree(self, data_path):
        '''
        '''

        if not data_path:
            data_path = settings.DEFAULT_DATA_FOLDER

        dir_content = sorted(Util.listdir(data_path))

        dir_listing = []

        for c in dir_content:

            full_url = os.path.join(data_path, c)

            # if not os.path.isdir(full_url):
            #   continue

            entry = {}
            entry['label'] = c
            entry['full_url'] = full_url
            entry['id'] = os.path.join(data_path, c)
            entry['load_on_demand'] = True

            dir_listing.append(entry)

        return dir_listing

    def get_content(self, data_path):
        '''
        Sends the content listing for a given path. This detects if the path is
        scan, section or fov.
        '''

        views = []

        path_type = self.check_path_type(data_path)

        # detect if this is a scan, section or fov
        if path_type == 'FOV':

            views.append({'data_path': data_path})

        elif path_type == 'SECTION':

            views.append({'data_path': data_path})

        elif path_type == 'SCAN':

            scan = Scan.from_directory(data_path, False)  # lazy indexing

            for i, section in enumerate(scan._sections):

                views.append(
                    {'data_path': os.path.join(data_path, section.id)})

        return views

    def get_meta_info(self, data_path):
        '''
        Get meta information for a requested data path.
        '''

        if data_path not in self._views.keys():

            path_type = self.check_path_type(data_path)

            # detect if this is a section or fov
            if path_type == 'FOV':
                # this is a FoV
                fov = FoV.from_directory(data_path, True)

                view = View.create(
                    data_path,
                    [fov],
                    fov._width,
                    fov._height,
                    fov._tx,
                    fov._ty,
                    self)

            elif path_type == 'SECTION':

                section = Section.from_directory(data_path, True, True)

                view = View.create(
                    data_path,
                    section._fovs,
                    section._width,
                    section._height,
                    section._tx,
                    section._ty,
                    self,
                    section._luts64_map)

            #
            # and add to our views dictionary
            #
            self._views[data_path] = view

        else:

            view = self._views[data_path]

        meta_info = {}
        meta_info['width'] = view._width
        meta_info['height'] = view._height
        meta_info['layer'] = 0
        meta_info['minLevel'] = 0
        meta_info['maxLevel'] = 1
        meta_info['tileSize'] = settings.CLIENT_TILE_SIZE
        meta_info['centers'] = view._centers

        return meta_info

    def get_image(self, data_path, x, y, z, w):
        '''
        Calculate which file(s) we need for the current openseadragon tile
        and load them as well as downsample them on the fly.
        '''

        # print '-'*80
        # print 'SD', data_path, x, y, z, w

        if settings.CACHE_CLIENT_TILES:

            osd_file_url = (data_path.replace('/', '_') + '_' + str(x) + '_' +
                            str(y) + '_' + str(z) + '_' + str(w) + '.jpg')
            osd_file_url_full = os.path.join(
                settings.CLIENT_TILE_CACHE_FOLDER, osd_file_url)

            if os.path.exists(osd_file_url_full):

                # we have this OSD tile cached on disk
                # print 'OSD CACHE HIT'
                osd_tile = cv2.imread(osd_file_url_full, 0)
                return cv2.imencode('.jpg', osd_tile)[1].tostring()

        view = self._views[data_path]

        # Create an empty dictionary for the View's luts64_map, if there isn't a map
        luts64_map = dict()
        if view._luts64_map is not None:
            luts64_map = view._luts64_map

        # calculate canvas coordinates
        x_c = x * settings.CLIENT_TILE_SIZE
        y_c = y * settings.CLIENT_TILE_SIZE
        w_c = settings.CLIENT_TILE_SIZE
        h_c = settings.CLIENT_TILE_SIZE

        top_left = [x_c, y_c]
        bottom_right = [x_c + w_c, y_c + h_c]

        # loop through all tiles and find ones which match the x_c, y_c, w_c,
        # h_c bounding box
        required_tiles = {}
        for t in view._tiles:
            tile_dict = view._tiles[t]

            tile = tile_dict['tile']
            # now the normalized coordinates which should match the coordinate
            # system
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            width = tile_dict['width'] / 2**w
            height = tile_dict['height'] / 2**w
            t_top_left = [tx, ty]
            t_bottom_right = [tx + width, ty + height]

            comp0 = top_left[0] < t_bottom_right[0]
            comp1 = bottom_right[0] > t_top_left[0]
            comp2 = top_left[1] < t_bottom_right[1]
            comp3 = bottom_right[1] > t_top_left[1]

            overlapping = comp0 and comp1 and comp2 and comp3

            if overlapping:
                required_tiles[t] = tile_dict

        stitched_w = min(view._width / 2**w - x_c, settings.CLIENT_TILE_SIZE)
        stitched_h = min(view._height / 2**w - y_c, settings.CLIENT_TILE_SIZE)

        stitched = np.zeros((stitched_h, stitched_w), dtype=np.uint8)

        if settings.INVERT:
            stitched[:] = 255

        # sort the required tiles to always give priority in the same order
        required_tiles_keys = sorted(
            required_tiles, key=lambda key: required_tiles[key])

        for t in required_tiles_keys:

            tile_dict = required_tiles[t]
            tile = tile_dict['tile']

            # fov paths need to be treated differently
            if self.check_path_type(data_path) != 'FOV':
                t_abs_data_path = os.path.join(data_path, tile_dict['fov'])
            else:
                t_abs_data_path = data_path

            # print 'LOADING', os.path.join(t_abs_data_path, tile._filename)
            if t in self._tiles.keys() and w in self._tiles[t]:
                current_tile = self._tiles[t][w]
                # print 'CACHE HIT'
            else:
                #
                # we add to cache
                #
                # print "Loading lut64_map of: {} --> {}".format(tile.id, luts64_map.get(os.path.split(tile.id)[-1].lower(), None))
                tile_img = tile.load(t_abs_data_path, settings.IMAGE_PREFIX, lut_base64=luts64_map.get(os.path.split(tile.id)[-1].lower(), None))

                current_tile = Manager.downsample_image(tile_img, 2**w)
                self._tiles[t] = {w: current_tile}

            # stitch it in our little openseadragon tile
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            t_width = tile_dict['width'] / 2**w
            t_height = tile_dict['height'] / 2**w

            stitched_x = int(max(tx, top_left[0]) - top_left[0])
            stitched_y = int(max(ty, top_left[1]) - top_left[1])

            stitched_w = int(
                min(t_width - max(top_left[0] - tx, 0),
                    settings.CLIENT_TILE_SIZE - stitched_x))
            stitched_h = int(
                min(t_height - max(top_left[1] - ty, 0),
                    settings.CLIENT_TILE_SIZE - stitched_y))

            t_sub_x = int(max(tx, top_left[0]) - tx)
            t_sub_y = int(max(ty, top_left[1]) - ty)

            stitched[
                stitched_y:stitched_y +
                stitched_h,
                stitched_x:stitched_x +
                stitched_w] = current_tile[
                    t_sub_y:t_sub_y +
                    stitched_h,
                    t_sub_x:t_sub_x +
                    stitched_w]

        if settings.INVERT:
            stitched = 255 - stitched

        if settings.CACHE_CLIENT_TILES:
            # print 'Writing OSD tile', osd_file_url_full
            cv2.imwrite(osd_file_url_full, stitched)

        return cv2.imencode('.jpg', stitched)[1].tostring()

    # Helping function
    @staticmethod
    def downsample_image(imagedata, factor):
        '''
        '''
        if factor == 1.:
            return imagedata

        factor = 1. / factor
        return cv2.resize(imagedata, (0, 0), fx=factor,
                          fy=factor, interpolation=cv2.INTER_LINEAR)
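
# A small sketch of why the Manager above keeps its tiles in an LRU: the cache is
# hard-bounded at 61 * 10 entries and silently evicts the least recently used
# tiles once more than that have been loaded. The payloads below are placeholder
# arrays, not real tile data.
import numpy as np
from lru import LRU

tiles = LRU(61 * 10)                       # same capacity as Manager._tiles

for t in range(700):                       # load more tiles than fit in the cache
    tiles[t] = {0: np.zeros((16, 16), dtype=np.uint8)}   # one downsample level per tile id

print(len(tiles))                          # 610 -- the capacity is enforced
print(0 in tiles, 699 in tiles)            # False True -- the oldest ids were evicted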
Beispiel #15
0
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size

        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            procs_per_host = self.circle.size // hostcnt
            self._read_cache_limit = ((max_ofile - 64) // procs_per_host) // 3
            self._write_cache_limit = ((max_ofile - 64) // procs_per_host) * 2 // 3
        else:
            # without a host count we cannot size the caches; fall back to the defaults below
            self._read_cache_limit = 0
            self._write_cache_limit = 0

        if self._read_cache_limit <= 0 or self._write_cache_limit <= 0:
            self._read_cache_limit = 1
            self._write_cache_limit = 8

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # count of enqueued work items
        self.reduce_items = 0  # count of processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.chunksums = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        for f in self.rfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        for f in self.wfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)

        # if the last job didn't finish cleanly, leftover fwalk files may remain;
        # when fcp cleanup gets the chance, it should remove those as well

        fwalk = "%s/fwalk.%s" % (self.circle.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """

        chunks = fi.st_size / self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # we batch operation hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        if G.use_store:
            while self.treewalk.flist.qsize > 0:
                fitems, _ = self.treewalk.flist.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist.mdel(G.DB_BUFSIZE)

            # store checkpoint
            log.debug("dbname = %s" % self.circle.dbname)
            dirname = os.path.dirname(self.circle.dbname)
            basename = os.path.basename(self.circle.dbname)
            chkpointname = basename + ".CHECK_OK"
            self.checkpoint_file = os.path.join(dirname, chkpointname)
            with open(self.checkpoint_file, "w") as f:
                f.write("%s" % self.totalsize)

        else:  # use memory
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

            # memory-checkpoint
            if self.checkpoint_file:
                self.do_no_interrupt_checkpoint()
                self.checkpoint_last = MPI.Wtime()

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: LRU cache of <path, file descriptor>
        @param flag: flags passed to os.open()
        @param limit: capacity of the cache
        @return: file descriptor, or -1 on failure
        """
        if k in d:
            return d[k]

        if len(d) >= limit:
            # the cache is full: close the least recently used descriptor before
            # the insertion below evicts it from the LRU
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError as e:
                log.warn("FD for %s not valid when closing" % old_k, extra=self.d)

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        src = work.src
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY, self._write_cache_limit)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)

    def do_checkpoint(self):
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.src, offset=work.offset, length=work.length,
                          digest=m.hexdigest())
            self.chunksums.append(ck)
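
# The do_open()/cleanup() pair above closes file descriptors by hand just before
# the LRU evicts them. A hedged alternative sketch (not what FCP does) is to let
# the LRU's eviction callback close descriptors automatically; `cached_open` and
# the capacity value are illustrative assumptions.
import os
from lru import LRU

def _close_evicted(path, fd):
    # invoked by the LRU whenever an entry is evicted to make room
    try:
        os.close(fd)
    except OSError:
        pass

rfd_cache = LRU(8, callback=_close_evicted)    # FCP derives its limit from RLIMIT_NOFILE

def cached_open(path, flag=os.O_RDONLY):
    if path in rfd_cache:
        return rfd_cache[path]
    fd = os.open(path, flag)
    rfd_cache[path] = fd                       # may evict (and thereby close) the oldest fd
    return fd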
Beispiel #16
0
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count =1
        # self.time = (self.first,self.last)
        self.l1 = LRU(c_hash)
        self.first =""
        self.last=""
        self.lats=[]
        self.longs=[]
        self.l2 = LRU(c_user)
        self.l3 = LRU(c_words)
        self.l4 = LRU(400)
    def set_hashLRU(self,l):
        self.set(self.l1, l)

    def set_userLRU(self,l):
        self.set(self.l2, l)

    def set_wordLRU(self,l):
        self.set(self.l3, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k,0)
            lru[k]=v+1

    def set_cluster(self, hashtags, users, words,links, cords):
        for k in hashtags:
            self.l1[k]=self.l1.get(k,0)+1
        for k in users:
            self.l2[k]=self.l2.get(k,0)+1
        for k in words:
            self.l3[k]=self.l3.get(k,0)+1
        for k in links:
            self.l4[k]=self.l4.get(k,0)+1
        if(cords is not None):
            self.lats.append(cords["coordinates"][1])
            self.longs.append(cords["coordinates"][0])
        self.topic_count+=1

    def get_similarity(self,hashtags,users,words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match =0
        h_ind =0
        u_ind =0
        w_ind =0
        c=0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        w1 = self.l3.get_size()
        for h in hashtags:
            # l1_items=zip(*self.l1.items())
            h_sum+= self.l1.get(h,0)
            if(self.l1.has_key(h)):
                ind = self.l1.keys().index(h)
                h_ind+= h1 - ind
                h_match+= 1 if ind<250 else 0
        for u in users:
            u_sum+= self.l2.get(u,0)
            if(self.l2.has_key(u)):
                u_ind+= u1 - self.l2.keys().index(u)
        for w in words:
            w_sum+= self.l3.get(w,0)
            if(self.l3.has_key(w)):
                w_ind+= w1 - self.l3.keys().index(w)
        if(h_match !=0):
            c = h_match -1
        # print(h_ind,h1,u_ind,u1,w_ind,w1, h_sum,w_sum,)
        similarity = (h_ind/(h1+1))*(h_sum/sum(self.l1.values() +[1])) + (u_ind/(u1+1))*(u_sum/sum(self.l2.values()+[1])) + (w_ind/(w1+1))*(w_sum/sum(self.l3.values()+[1])) +c
        return similarity
    def flush1(self, cache, size):
        if(len(cache.keys())>5):
            # keep only the five most recently used tokens, re-inserting them so
            # the most recent ends up as MRU again
            tokens = reversed(cache.keys()[:5])
            cache.clear()
            for i in tokens:
                cache[i]=1


    def flush(self):
        self.flush1(self.l1,500)
        self.flush1(self.l2, 500)
        self.flush1(self.l3,3500)
        self.topic_count=1
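
# A short sketch of feeding and scoring one topic4 cluster; the hashtags, users,
# words and capacities below are made-up illustrations.
topic = topic4(c_hash=500, c_user=500, c_words=3500)
topic.set_cluster(["#ai"], ["@alice"], ["model", "train"], [], None)
topic.set_cluster(["#ai", "#ml"], ["@bob"], ["train"], ["http://example.com"], None)

# the score grows with how much the new hashtags/users/words overlap with what
# the cluster has seen recently (LRU key order acts as a recency weight)
print(topic.topic_count, topic.get_similarity(["#ai"], ["@alice"], ["train"]))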
Beispiel #17
0
class GelbooruViewer:
    API_URL = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
    MAX_ID = 1
    MAX_ID_LOCK = Lock()
    MAX_CACHE_SIZE = 32
    MAX_CACHE_TIME = 24 * 60  # minutes
    PICTURES_PER_TAG = 200

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(
            {
                'Accept': 'application/json, application/xml',
                'Accept-Language': 'en-US',
                'User-Agent': 'Mozilla/5.0 GelbooruViewer/1.0 (+https://github.com/ArchieMeng/GelbooruViewer)'
            }
        )
        # only cache for get_all with tags while pid is 0!!!

        if importlib.find_loader('lru'):
            from lru import LRU
            self.cache = LRU(GelbooruViewer.MAX_CACHE_SIZE)
        else:
            self.cache = dict()

        self.cache_lock = Lock()
        # occasionally update cache
        self.last_cache_used = time()
        self.update_cache_thread = Thread(target=self._update_cache_loop, daemon=True)
        self.update_cache_thread.start()

        # get latest image to update MAX_ID
        self.get(limit=0)

    def _update_cache(self, tags, num=None):
        """
        Do the update cache task
        :param tags: tags of picture to update to cache
        :param num:  amount of pictures
        :return:
        """
        if tags:
            result = [*self.get_all_generator(tags, 0, num, thread_limit=1, limit=100)]
            if result:
                key = '+'.join(tags)
                with self.cache_lock:
                    self.cache[key] = result

    def _update_cache_loop(self):
        """
        Occasionally refresh cache. Clear cache if unused for a long time.
        :return:
        """
        minutes = 2 * 60
        while True:
            sleep(60 * minutes)
            if time() - self.last_cache_used > self.MAX_CACHE_TIME * 60:
                self.cache.clear()
                gc.collect()
                continue
            with self.cache_lock:
                keys = self.cache.keys()
            with ThreadPoolExecutor(max_workers=2) as executor:
                futures = [executor.submit(self._update_cache, key.split('+'), GelbooruViewer.PICTURES_PER_TAG) for key in keys]
                for future in as_completed(futures):
                    try:
                        result = future.result()
                        print(result)
                    except Exception as e:
                        print("Exception happened in GelbooruViewer._update_cache_loop", type(e), e)

    def get_raw_content(self, **kwargs):
        content = None
        with self.session as session:
            response = session.get(GelbooruViewer.API_URL, params=kwargs)
            try:
                content = response.content
            except Exception as e:
                logging.error(str(e))
                pass
        return content

    def get(self, **kwargs)->list:
        """
        use Gelbooru api to fetch picture info.

        :param kwargs: allowed args includes
        limit: How many posts you want to retrieve. There is a hard limit of 100 posts per request.

        pid: The page number.

        cid: Change ID of the post.
        This is in Unix time so there are likely others with the same value if updated at the same time.

        tags: The tags to search for. Any tag combination that works on the web site will work here.
        This includes all the meta-tags. See cheatsheet for more information.

        :return: a list of GelbooruPicture; an empty list is returned on errors, and None when the response contains no posts
        """
        attempt = 0
        content = None
        while attempt < 3 and content is None:
            attempt += 1
            content = self.get_raw_content(**kwargs)

        if content is None:
            return []
        if isinstance(content, bytes):
            xml_str = content.decode('utf-8')
        else:
            xml_str = content

        root = ElementTree.fromstring(xml_str)
        posts = root.findall('post')
        picture_list = []

        if posts:
            cur_max_id = int(posts[0].attrib['id'])
            with GelbooruViewer.MAX_ID_LOCK:
                GelbooruViewer.MAX_ID = max(GelbooruViewer.MAX_ID, cur_max_id)
        else:
            return None

        for post in posts:
            info = post.attrib
            picture_list.append(
                GelbooruPicture(
                    info['width'],
                    info['height'],
                    info['score'],
                    info['source'],
                    "https:"+info['preview_url'],
                    "https:"+info['sample_url'],
                    "https:"+info['file_url'],
                    info['created_at'],
                    info['creator_id'],
                    [tag for tag in info['tags'].split(' ') if tag and not tag.isspace()],
                    info['id'],
                    info['rating']
                )
            )
        return picture_list

    def get_all(self, tags: list, pid=0, num=None, thread_limit=5, use_cache=True, limit=25):
        """
        regardless of official request limit amount, use threading to request amount you want

        When pictures is found in cache, list is returned.

        When pictures is found but not in cache, generator is returned.

        Else, None is returned

        :param limit: number of pictures in per request

        :param use_cache: whether prefer internal cache

        :param thread_limit: amount of threads running at the same time

        :param tags: tags must be provided

        :param pid: beginning page id , index from 0

        :param num: num of picture you want.
        This function might return less pictures than u want only if Gelbooru hasn't got enough picture

        :return: a generator of gelboorupicture or list or None

        """
        tags.sort()
        if use_cache and pid == 0:
            with self.cache_lock:
                key = '+'.join(tags)
                if key in self.cache and isinstance(self.cache[key], list):
                    self.last_cache_used = time()
                    if not num:
                        return self.cache[key]
                    else:
                        return self.cache[key][:num]
                elif key not in self.cache or isinstance(self.cache[key], str):
                    self.last_cache_used = time()
                    # only one update thread runs per key; while it is executing,
                    # a str marker is stored in the cache as a placeholder
                    self.cache[key] = "executing"
                    # the cache size is kept small to guard against unbounded memory growth
                    thread = Thread(
                        target=self._update_cache,
                        args=(tags, GelbooruViewer.PICTURES_PER_TAG),
                        daemon=True
                    )
                    thread.start()

        content = self.get_raw_content(tags=tags, limit=0)
        xml_str = content.decode('utf-8')
        root = ElementTree.fromstring(xml_str)
        try:
            total = int(root.attrib['count'])
        except:
            return None
        if total > 0:
            return self.get_all_generator(tags, pid, num, thread_limit, total, limit)
        else:
            return None

    def get_all_generator(
            self,
            tags: list,
            pid=0,
            num=None,
            thread_limit=5,
            total=None,
            limit=25
    ):
        """
        True function of get all. Generator is returned
        :param thread_limit: max threads to fetch pictures at one time
        :param tags: tags of pictures

        :param pid: beginning page id , index from 0

        :param num: num of picture you want.num of picture you want.
        This function might return less pictures than u want only if Gelbooru hasn't got enough picture

        :param total: total amount of picture, just set None if u don't know it. This is used by internal function
        :param limit: picture number per request.
        Generally, limit=10 cost 1.2s per request, while 25 cost 1.4s, 50 cost 2.2s, 100 cost 2.6s.
        The Larger limit , the faster speed in per request, but larger in total get_all timing.

        :return:
        """
        if limit < 0 or limit > 100:
            limit = 10

        def _get(tags, pid):
            content = self.get_raw_content(tags=tags, limit=limit, pid=pid)
            xml_string = content.decode()
            posts = ElementTree.fromstring(xml_string).findall('post')
            return posts
        if total is None:
            content = self.get_raw_content(tags=tags, limit=0)
            xml_str = content.decode('utf-8')
            root = ElementTree.fromstring(xml_str)
            total = int(root.attrib['count'])
        if isinstance(num, int):
            if num > 0:
                # if total amount is too large, use num instead.
                total = min(total, num)
        if tags and total > 0:
            with ThreadPoolExecutor(max_workers=thread_limit) as executor:
                final_pid = int(total / limit)
                start = pid
                tasks = []
                while start < final_pid + 1:
                    futures2idx = {
                        executor.submit(_get, tags, i): i
                        for i in tasks + [j for j in range(start, min(start + thread_limit, final_pid + 1))]
                    }
                    tasks = []
                    for future in as_completed(futures2idx):
                        idx = futures2idx[future]
                        try:
                            posts = future.result()
                            for post in posts:
                                info = post.attrib
                                yield GelbooruPicture(
                                    info['width'],
                                    info['height'],
                                    info['score'],
                                    info['source'],
                                    "https:" + info['preview_url'],
                                    "https:" + info['sample_url'],
                                    "https:" + info['file_url'],
                                    info['created_at'],
                                    info['creator_id'],
                                    [tag for tag in info['tags'].split(' ') if tag and not tag.isspace()],
                                    info['id'],
                                    info['rating']
                                )
                        except Exception as e:
                            print("GelbooruViewer.get_all_generators raise", type(e), e)
                            tasks.append(idx)
                    start += thread_limit
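
# A minimal usage sketch for the viewer above; the tag and counts are assumptions,
# and the calls perform real network requests against the Gelbooru API.
viewer = GelbooruViewer()

pictures = viewer.get(tags="landscape", limit=10)     # one API page, at most 10 posts
print(len(pictures or []))

# cached, threaded fetch across pages; may return a list, a generator, or None
result = viewer.get_all(["landscape"], num=50)
if result is not None:
    print(sum(1 for _ in result))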
Beispiel #18
0
class Manager(object):
    def __init__(self):
        '''
        '''
        self._views = LRU(50)
        # tile cache - enough for 1 MFOV for 10 parallel users
        self._tiles = LRU(61 * 10)

        self._client_tiles = {}

    def start(self):
        '''
        '''
        pass

    def check_path_type(self, data_path):
        '''
        Check whether the data_path is a scan, section or fov.
        '''

        # we should check how many levels deep is the IMAGE_COORDINATES_FILE
        # level 0: this is a FOV
        # level 1: this is a section
        # level 2: this is a scan

        if os.path.exists(
                os.path.join(data_path, settings.IMAGE_COORDINATES_FILE)):
            return 'FOV'

        if os.path.exists(
                os.path.join(data_path, Util.get_first_level_subdir(data_path),
                             settings.IMAGE_COORDINATES_FILE)):
            return 'SECTION'

        if os.path.exists(
                os.path.join(data_path,
                             Util.get_second_level_subdir(data_path),
                             settings.IMAGE_COORDINATES_FILE)):
            return 'SCAN'

        return None

    def get_tree(self, data_path):
        '''
        '''

        if not data_path:
            data_path = settings.DEFAULT_DATA_FOLDER

        dir_content = sorted(Util.listdir(data_path))

        dir_listing = []

        for c in dir_content:

            full_url = os.path.join(data_path, c)

            # if not os.path.isdir(full_url):
            #   continue

            entry = {}
            entry['label'] = c
            entry['full_url'] = full_url
            entry['id'] = os.path.join(data_path, c)
            entry['load_on_demand'] = True

            dir_listing.append(entry)

        return dir_listing

    def get_content(self, data_path):
        '''
        Sends the content listing for a given path. This detects if the path is
        scan, section or fov.
        '''

        views = []

        path_type = self.check_path_type(data_path)

        # detect if this is a scan, section or fov
        if path_type == 'FOV':

            views.append({'data_path': data_path})

        elif path_type == 'SECTION':

            views.append({'data_path': data_path})

        elif path_type == 'SCAN':

            scan = Scan.from_directory(data_path, False)  # lazy indexing

            for i, section in enumerate(scan._sections):

                views.append(
                    {'data_path': os.path.join(data_path, section.id)})

        return views

    def get_meta_info(self, data_path):
        '''
        Get meta information for a requested data path.
        '''

        if data_path not in self._views.keys():

            path_type = self.check_path_type(data_path)

            # detect if this is a section or fov
            if path_type == 'FOV':
                # this is a FoV
                fov = FoV.from_directory(data_path, True)

                view = View.create(data_path, [fov], fov._width, fov._height,
                                   fov._tx, fov._ty, self)

            elif path_type == 'SECTION':

                section = Section.from_directory(data_path, True, True)

                view = View.create(data_path, section._fovs, section._width,
                                   section._height, section._tx, section._ty,
                                   self, section._luts64_map)

            #
            # and add to our views dictionary
            #
            self._views[data_path] = view

        else:

            view = self._views[data_path]

        meta_info = {}
        meta_info['width'] = view._width
        meta_info['height'] = view._height
        meta_info['layer'] = 0
        meta_info['minLevel'] = 0
        meta_info['maxLevel'] = 1
        meta_info['tileSize'] = settings.CLIENT_TILE_SIZE
        meta_info['centers'] = view._centers

        return meta_info

    def get_image(self, data_path, x, y, z, w):
        '''
        Calculate which file(s) we need for the current openseadragon tile
        and load them as well as downsample them on the fly.
        '''

        # print '-'*80
        # print 'SD', data_path, x, y, z, w

        if settings.CACHE_CLIENT_TILES:

            osd_file_url = (data_path.replace('/', '_') + '_' + str(x) + '_' +
                            str(y) + '_' + str(z) + '_' + str(w) + '.jpg')
            osd_file_url_full = os.path.join(settings.CLIENT_TILE_CACHE_FOLDER,
                                             osd_file_url)

            if os.path.exists(osd_file_url_full):

                # we have this OSD tile cached on disk
                # print 'OSD CACHE HIT'
                osd_tile = cv2.imread(osd_file_url_full, 0)
                return cv2.imencode('.jpg', osd_tile)[1].tostring()

        view = self._views[data_path]

        # Create an empty dictionary for the View's luts64_map, if there isn't a map
        luts64_map = dict()
        if view._luts64_map is not None:
            luts64_map = view._luts64_map

        # calculate canvas coordinates
        x_c = x * settings.CLIENT_TILE_SIZE
        y_c = y * settings.CLIENT_TILE_SIZE
        w_c = settings.CLIENT_TILE_SIZE
        h_c = settings.CLIENT_TILE_SIZE

        top_left = [x_c, y_c]
        bottom_right = [x_c + w_c, y_c + h_c]

        # loop through all tiles and find ones which match the x_c, y_c, w_c,
        # h_c bounding box
        required_tiles = {}
        for t in view._tiles:
            tile_dict = view._tiles[t]

            tile = tile_dict['tile']
            # now the normalized coordinates which should match the coordinate
            # system
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            width = tile_dict['width'] / 2**w
            height = tile_dict['height'] / 2**w
            t_top_left = [tx, ty]
            t_bottom_right = [tx + width, ty + height]

            comp0 = top_left[0] < t_bottom_right[0]
            comp1 = bottom_right[0] > t_top_left[0]
            comp2 = top_left[1] < t_bottom_right[1]
            comp3 = bottom_right[1] > t_top_left[1]

            overlapping = comp0 and comp1 and comp2 and comp3

            if overlapping:
                required_tiles[t] = tile_dict

        stitched_w = min(view._width / 2**w - x_c, settings.CLIENT_TILE_SIZE)
        stitched_h = min(view._height / 2**w - y_c, settings.CLIENT_TILE_SIZE)

        stitched = np.zeros((stitched_h, stitched_w), dtype=np.uint8)

        if settings.INVERT:
            stitched[:] = 255

        # sort the required tiles to always give priority in the same order
        required_tiles_keys = sorted(required_tiles,
                                     key=lambda key: required_tiles[key])

        for t in required_tiles_keys:

            tile_dict = required_tiles[t]
            tile = tile_dict['tile']

            # fov paths need to be treated differently
            if self.check_path_type(data_path) != 'FOV':
                t_abs_data_path = os.path.join(data_path, tile_dict['fov'])
            else:
                t_abs_data_path = data_path

            # print 'LOADING', os.path.join(t_abs_data_path, tile._filename)
            if t in self._tiles.keys() and w in self._tiles[t]:
                current_tile = self._tiles[t][w]
                # print 'CACHE HIT'
            else:
                #
                # we add to cache
                #
                # print "Loading lut64_map of: {} --> {}".format(tile.id, luts64_map.get(os.path.split(tile.id)[-1].lower(), None))
                tile_img = tile.load(t_abs_data_path,
                                     settings.IMAGE_PREFIX,
                                     lut_base64=luts64_map.get(
                                         os.path.split(tile.id)[-1].lower(),
                                         None))

                current_tile = Manager.downsample_image(tile_img, 2**w)
                self._tiles[t] = {w: current_tile}

            # stitch it in our little openseadragon tile
            tx = tile_dict['tx'] / 2**w
            ty = tile_dict['ty'] / 2**w
            t_width = tile_dict['width'] / 2**w
            t_height = tile_dict['height'] / 2**w

            stitched_x = int(max(tx, top_left[0]) - top_left[0])
            stitched_y = int(max(ty, top_left[1]) - top_left[1])

            stitched_w = int(
                min(t_width - max(top_left[0] - tx, 0),
                    settings.CLIENT_TILE_SIZE - stitched_x))
            stitched_h = int(
                min(t_height - max(top_left[1] - ty, 0),
                    settings.CLIENT_TILE_SIZE - stitched_y))

            t_sub_x = int(max(tx, top_left[0]) - tx)
            t_sub_y = int(max(ty, top_left[1]) - ty)

            stitched[stitched_y:stitched_y + stitched_h,
                     stitched_x:stitched_x +
                     stitched_w] = current_tile[t_sub_y:t_sub_y + stitched_h,
                                                t_sub_x:t_sub_x + stitched_w]

        if settings.INVERT:
            stitched = 255 - stitched

        if settings.CACHE_CLIENT_TILES:
            # print 'Writing OSD tile', osd_file_url_full
            cv2.imwrite(osd_file_url_full, stitched)

        return cv2.imencode('.jpg', stitched)[1].tostring()

    # Helping function
    @staticmethod
    def downsample_image(imagedata, factor):
        '''
        '''
        if factor == 1.:
            return imagedata

        factor = 1. / factor
        return cv2.resize(imagedata, (0, 0),
                          fx=factor,
                          fy=factor,
                          interpolation=cv2.INTER_LINEAR)
Beispiel #19
0
class MemoryStateManager:
    '''
    Meaningless for anything other than tests
    '''
    def __init__(self, size=10):
        self.size = size
        self._data = LRU(self.size)
        self._locks = {}
        self._canceled = set()
        self.worker_id = uuid.uuid4().hex

    def set_loop(self, loop=None):
        pass

    async def update(self, task_id, data, ttl=None):
        # Updates existing data with new data
        existing = await self.get(task_id)
        existing.update(data)
        self._data[task_id] = existing

    async def get(self, task_id):
        return self._data.get(task_id, {})

    async def exists(self, task_id):
        return task_id in self._data

    async def list(self):
        for task_id in self._data.keys():
            yield task_id

    async def acquire(self, task_id, ttl):
        already_locked = await self.is_locked(task_id)
        if already_locked:
            raise TaskAlreadyAcquired(task_id)

        # Set new lock
        from guillotina_amqp.utils import TimeoutLock
        lock = TimeoutLock(self.worker_id)
        await lock.acquire(ttl=ttl)
        self._locks[task_id] = lock

    async def is_mine(self, task_id):
        if task_id not in self._locks:
            raise TaskNotFoundException(task_id)
        lock = self._locks[task_id]
        return lock.locked() and lock.worker_id == self.worker_id

    async def is_locked(self, task_id):
        if task_id not in self._locks:
            return False
        return self._locks[task_id].locked()

    async def release(self, task_id):
        if not await self.is_mine(task_id):
            # You can't refresh a lock that's not yours
            raise TaskAccessUnauthorized(task_id)
        # Release lock and pop it from data structure
        self._locks[task_id].release()
        self._locks.pop(task_id, None)

    async def refresh_lock(self, task_id, ttl):
        if task_id not in self._locks:
            raise TaskNotFoundException(task_id)

        if not await self.is_locked(task_id):
            raise Exception(f'Task {task_id} is not locked')

        if not await self.is_mine(task_id):
            # You can't refresh a lock that's not yours
            raise TaskAccessUnauthorized(task_id)

        # Refresh
        return await self._locks[task_id].refresh_lock(ttl)

    async def cancel(self, task_id):
        self._canceled.update({task_id})
        return True

    async def cancelation_list(self):
        canceled = copy.deepcopy(self._canceled)
        for task_id in canceled:
            yield task_id

    async def clean_canceled(self, task_id):
        try:
            self._canceled.remove(task_id)
            return True
        except KeyError:
            # Task id wasn't canceled
            return False

    async def is_canceled(self, task_id):
        return task_id in self._canceled

    async def _clean(self):
        self._data = LRU(self.size)
        self._locks = {}
        self._canceled = set()
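
# A quick asyncio sketch of the in-memory manager above; the task id and ttl are
# made up, and acquire() needs the guillotina_amqp package to be importable.
import asyncio

async def demo():
    msm = MemoryStateManager(size=10)
    await msm.update("task-1", {"status": "scheduled"})
    await msm.acquire("task-1", ttl=30)
    print(await msm.get("task-1"))            # {'status': 'scheduled'}
    print(await msm.is_mine("task-1"))        # True
    await msm.release("task-1")
    print([tid async for tid in msm.list()])  # ['task-1']

asyncio.run(demo())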
Beispiel #20
0
from lru import LRU

l = LRU(5)           # capacity 5; filled with keys 0-4 so the outputs below line up
for i in range(5):
    l[i] = str(i)

print(l.items())  # Prints items in MRU order
# Would print [(4, '4'), (3, '3'), (2, '2'), (1, '1'), (0, '0')]

print(l.peek_first_item(), l.peek_last_item())  # return the MRU item and the LRU item
# Would print (4, '4') (0, '0')

l[5] = '5'  # Inserting one more item should evict the old item
print(l.items())
# Would print [(5, '5'), (4, '4'), (3, '3'), (2, '2'), (1, '1')]

l[3]  # Accessing an item would make it MRU
print(l.items())
# Would print [(3, '3'), (5, '5'), (4, '4'), (2, '2'), (1, '1')]
# Now 3 is in front

print(l.keys())  # Keys alone, in MRU order
# Would print [3, 5, 4, 2, 1]

del l[4]  # Delete an item
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2'), (1, '1')]

print(l.get_size())
# Would print 5

l.set_size(3)
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2')]
print(l.get_size())
# Would print 3
print(l.has_key(5))