Example #1
    def test_callback(self):

        counter = [0]

        first_key = 'a'
        first_value = 1

        def callback(key, value):
            self.assertEqual(key, first_key)
            self.assertEqual(value, first_value)
            counter[0] += 1

        l = LRU(1, callback=callback)
        l[first_key] = first_value
        l['b'] = 1  # test calling the callback

        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])

        l['b'] = 2  # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])
        self.assertEqual(l.values(), [2])

        l = LRU(1, callback=callback)
        l[first_key] = first_value

        l.set_callback(None)
        l['c'] = 1  # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['c'])

        l.set_callback(callback)
        del l['c']  # doesn't call callback
        self.assertEqual(l.keys(), [])
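
The test above relies on the eviction callback firing only when an item is displaced by a new insertion, not on overwrites or explicit deletes. A minimal standalone sketch of that behaviour, assuming `from lru import LRU` as in the examples on this page (variable names are illustrative):

from lru import LRU

evicted = []

def on_evict(key, value):
    # called only when an item is pushed out by a new insertion
    evicted.append((key, value))

cache = LRU(1, callback=on_evict)
cache['a'] = 1
cache['b'] = 2          # 'a' is evicted -> on_evict('a', 1)
cache['b'] = 3          # overwrite in place, no eviction
del cache['b']          # explicit delete, no eviction

print(evicted)          # [('a', 1)]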
Example #2
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        self.l1 = LRU(c_hash)
        self.l2 = LRU(c_user)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1

        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if self.l1.has_key(h):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if self.l2.has_key(u):
                u_ind += u1 - self.l2.keys().index(u)

        if h_match != 0:
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = ((h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1]))
                      + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1]))
                      + c)
        return similarity
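
Example #2 uses each LRU both as a bounded frequency counter (`lru[k] = lru.get(k, 0) + 1`) and as a recency ranking, since `keys()` is ordered from most to least recently used. A small sketch of that pattern, under the same `from lru import LRU` assumption (the token stream below is made up):

from lru import LRU

counts = LRU(3)  # at most 3 distinct tokens are tracked

for token in ["a", "b", "a", "c", "d"]:        # hypothetical token stream
    counts[token] = counts.get(token, 0) + 1   # bump count, refresh recency

print(counts.keys())              # ['d', 'c', 'a'] -- most recent first; 'b' was evicted
rank = counts.keys().index('a')   # smaller index = seen more recently,
                                  # the recency weight used by get_similarity()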
Example #3
    def test_callback(self):

        counter = [0]

        first_key = 'a'
        first_value = 1

        def callback(key, value):
            self.assertEqual(key, first_key)
            self.assertEqual(value, first_value)
            counter[0] += 1

        l = LRU(1, callback=callback)
        l[first_key] = first_value
        l['b'] = 1              # test calling the callback

        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])

        l['b'] = 2              # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b'])
        self.assertEqual(l.values(), [2])


        l = LRU(1, callback=callback)
        l[first_key] = first_value

        l.set_callback(None)
        l['c'] = 1              # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['c'])

        l.set_callback(callback)
        del l['c']              # doesn't call callback
        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), [])

        l = LRU(2, callback=callback)
        l['a'] = 1              # test calling the callback
        l['b'] = 2              # test calling the callback

        self.assertEqual(counter[0], 1)
        self.assertEqual(l.keys(), ['b', 'a'])
        l.set_size(1)
        self.assertEqual(counter[0], 2) # callback invoked
        self.assertEqual(l.keys(), ['b'])
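
The last block of Example #3 checks that shrinking the cache with set_size() also routes evictions through the callback. A short sketch of just that behaviour (again assuming `from lru import LRU`; names are illustrative):

from lru import LRU

dropped = []
cache = LRU(3, callback=lambda k, v: dropped.append(k))

cache['a'] = 1
cache['b'] = 2
cache['c'] = 3

cache.set_size(1)     # forces out the least recently used entries
print(cache.keys())   # ['c']
print(dropped)        # both 'a' and 'b' were evicted via the callback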
Example #4
    def test_empty(self):
        l = LRU(1)
        self.assertEquals([], l.keys())
        self.assertEquals([], l.values())
Example #5
    def test_empty(self):
        l = LRU(1)
        self.assertEqual([], l.keys())
        self.assertEqual([], l.values())
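
Examples #4 and #5 check the same thing: a fresh LRU exposes empty lists from keys() and values(). Unlike the dict views of Python 3, these methods return plain lists, which is why the tests can compare them directly against []. A tiny sketch under the same `from lru import LRU` assumption:

from lru import LRU

l = LRU(1)
print(l.keys() == [])    # True: keys() returns a list, not a view
print(l.values() == [])  # True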
Example #6
class StarboardEntries:
    """A way of managing starboard entries.
    Sort of like an ORM, but also not fully."""

    _pool: asyncpg.Pool = attr.ib()
    # note: entry cache isn't really a dict, but for typehinting purposes this works
    _entry_cache: typing.Dict[int, StarboardEntry] = attr.ib()
    _sql_loop_task: asyncio.Task = attr.ib()
    _sql_queries: cclass.SetUpdateAsyncQueue = attr.ib()

    def __init__(self, pool: asyncpg.Pool, cache_size: int = 200):
        self._pool = pool
        self._entry_cache = LRU(
            cache_size
        )  # the 200 should be raised as the bot grows bigger
        self._sql_queries = cclass.SetUpdateAsyncQueue()

        loop = asyncio.get_event_loop()
        self._sql_loop_task = loop.create_task(self._sql_loop())

    def stop(self):
        """Stops the SQL task loop."""
        self._sql_loop_task.cancel()

    async def _sql_loop(self):
        """Actually runs SQL updating, hopefully one after another.

        Saves speed on adding, deleting, and updating by offloading
        this step here."""
        try:
            while True:
                entry = await self._sql_queries.get()
                logging.getLogger("discord").debug(f"Running {entry.query}.")
                await self._pool.execute(entry.query, timeout=60, *entry.args)
                self._sql_queries.task_done()
        except asyncio.CancelledError:
            pass

    def _get_required_from_entry(self, entry: StarboardEntry):
        """Transforms data into the form needed for databases."""
        return (
            entry.ori_mes_id,
            entry.ori_chan_id,
            entry.star_var_id,
            entry.starboard_id,
            entry.author_id,
            list(entry.ori_reactors),
            list(entry.var_reactors),
            entry.guild_id,
            entry.forced,
            entry.frozen,
            entry.trashed,
        )

    def _str_builder_to_insert(
        self, str_builder: typing.List[str], entry: StarboardEntry
    ):
        """Takes data from a string builder list and eventually
        puts the data needed into the _sql_queries variable."""
        query = "".join(str_builder)
        args = self._get_required_from_entry(entry)
        self._sql_queries.put_nowait(StarboardSQLEntry(query, args))

    def _handle_upsert(self, entry: StarboardEntry):
        """Upserts an entry by using an INSERT with an ON CONFLICT cause.
        This is a PostgreSQL-specific feature, so that's nice!"""
        str_builder = [
            "INSERT INTO starboard(ori_mes_id, ori_chan_id, star_var_id, ",
            "starboard_id, author_id, ori_reactors, var_reactors, ",
            "guild_id, forced, frozen, trashed) VALUES($1, $2, $3, $4, ",
            "$5, $6, $7, $8, $9, $10, $11) ON CONFLICT (ori_mes_id) DO UPDATE ",
            "SET ori_chan_id = $2, star_var_id = $3, starboard_id = $4, ",
            "author_id = $5, ori_reactors = $6, var_reactors = $7, guild_id = $8, ",
            "forced = $9, frozen = $10, trashed = $11",
        ]
        self._str_builder_to_insert(str_builder, entry)

    def upsert(self, entry: StarboardEntry):
        """Either adds or updates an entry in the collection of entries."""
        temp_dict = {entry.ori_mes_id: entry}
        if entry.star_var_id:
            temp_dict[entry.star_var_id] = entry

        self._entry_cache.update(**temp_dict)  # type: ignore this is valid i promise
        self._handle_upsert(entry)

    def delete(self, entry_id: int):
        """Removes an entry from the collection of entries."""
        self._entry_cache.pop(entry_id, None)
        self._sql_queries.put_nowait(
            StarboardSQLEntry("DELETE FROM starboard WHERE ori_mes_id = $1", [entry_id])
        )

    async def get(
        self, entry_id: int, check_for_var: bool = False
    ) -> typing.Optional[StarboardEntry]:
        """Gets an entry from the collection of entries."""
        entry = None

        if self._entry_cache.has_key(entry_id):  # type: ignore
            entry = self._entry_cache[entry_id]
        else:
            entry = discord.utils.find(
                lambda e: e and e.star_var_id == entry_id, self._entry_cache.values()
            )

        if not entry:
            async with self._pool.acquire() as conn:
                data = await conn.fetchrow(
                    f"SELECT * FROM starboard WHERE ori_mes_id = {entry_id} OR"
                    f" star_var_id = {entry_id}"
                )
                if data:
                    entry = StarboardEntry.from_row(data)
                    self._entry_cache[entry_id] = entry

        if entry and check_for_var and not entry.star_var_id:
            return None

        return entry

    async def select_query(self, query: str):
        """Selects the starboard database directly for entries based on the query."""
        async with self._pool.acquire() as conn:
            data = await conn.fetch(f"SELECT * FROM starboard WHERE {query}")

            if not data:
                return None
            return tuple(StarboardEntry.from_row(row) for row in data)

    async def raw_query(self, query: str):
        """Runs the raw query against the pool, assuming the results are starboard entries."""
        async with self._pool.acquire() as conn:
            data = await conn.fetch(query)

            if not data:
                return None
            return tuple(StarboardEntry.from_row(row) for row in data)

    async def super_raw_query(self, query: str):
        """You want a raw query? You'll get one."""
        async with self._pool.acquire() as conn:
            return await conn.fetch(query)

    async def query_entries(
        self, seperator: str = "AND", **conditions: typing.Dict[str, str]
    ) -> typing.Optional[typing.Tuple[StarboardEntry, ...]]:
        """Queries entries based on conditions provided.

        For example, you could do `query_entries(guild_id=143425)` to get
        entries with that guild id."""
        sql_conditions: list[str] = [
            f"{key} = {value}" for key, value in conditions.items()
        ]
        combined_statements = f" {seperator} ".join(sql_conditions)

        async with self._pool.acquire() as conn:
            data = await conn.fetch(
                f"SELECT * FROM starboard WHERE {combined_statements}"
            )

            if not data:
                return None
            return tuple(StarboardEntry.from_row(row) for row in data)

    async def get_random(self, guild_id: int) -> typing.Optional[StarboardEntry]:
        """Gets a random entry from a guild."""
        # query adapted from
        # https://github.com/Rapptz/RoboDanny/blob/1fb95d76d1b7685e2e2ff950e11cddfc96efbfec/cogs/stars.py#L1082
        query = """SELECT *
                   FROM starboard
                   WHERE guild_id=$1
                   AND star_var_id IS NOT NULL
                   OFFSET FLOOR(RANDOM() * (
                       SELECT COUNT(*)
                       FROM starboard
                       WHERE guild_id=$1
                       AND star_var_id IS NOT NULL
                   ))
                   LIMIT 1
                """

        async with self._pool.acquire() as conn:
            data = await conn.fetchrow(query, guild_id)
            if not data:
                return None
            return StarboardEntry.from_row(data)
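
The get() coroutine in Example #6 illustrates a common layering: check the bounded LRU first (has_key() plus a linear scan of values() for the secondary star_var_id), and only fall back to the database on a miss, writing the fetched row back into the cache. A stripped-down sketch of that lookup order, with a hypothetical fetch_row() standing in for the asyncpg call:

import typing
from lru import LRU

cache = LRU(200)   # bounded, like StarboardEntries._entry_cache

async def get_entry(entry_id: int, fetch_row: typing.Callable) -> typing.Optional[dict]:
    # 1) primary-key hit in the cache
    if cache.has_key(entry_id):
        return cache[entry_id]

    # 2) fall back to the database (fetch_row is a stand-in for pool.fetchrow)
    row = await fetch_row(entry_id)
    if row is not None:
        cache[entry_id] = row   # populate the cache; the LRU bounds memory use
    return row

In the original class, the secondary star_var_id lookup is handled with discord.utils.find over values() before the database is queried.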
Example #7
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        # self.time = (self.first, self.last)
        self.l1 = LRU(c_hash)
        self.first = ""
        self.last = ""
        self.lats = []
        self.longs = []
        self.l2 = LRU(c_user)
        self.l3 = LRU(c_words)
        self.l4 = LRU(400)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set_wordLRU(self, l):
        self.set(self.l3, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words, links, cords):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1
        for k in words:
            self.l3[k] = self.l3.get(k, 0) + 1
        for k in links:
            self.l4[k] = self.l4.get(k, 0) + 1
        if cords is not None:
            self.lats.append(cords["coordinates"][1])
            self.longs.append(cords["coordinates"][0])
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        w1 = self.l3.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if self.l1.has_key(h):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if self.l2.has_key(u):
                u_ind += u1 - self.l2.keys().index(u)
        for w in words:
            w_sum += self.l3.get(w, 0)
            if self.l3.has_key(w):
                w_ind += w1 - self.l3.keys().index(w)
        if h_match != 0:
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = ((h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1]))
                      + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1]))
                      + (w_ind / (w1 + 1)) * (w_sum / sum(self.l3.values() + [1]))
                      + c)
        return similarity
    def flush1(self, cache, size):
        # keep only the five most recently used keys and reset their counts
        # (the size argument is currently unused)
        if len(cache.keys()) > 5:
            tokens = reversed(cache.keys()[:5])
            cache.clear()
            for i in tokens:
                cache[i] = 1

    def flush(self):
        self.flush1(self.l1, 500)
        self.flush1(self.l2, 500)
        self.flush1(self.l3, 3500)
        self.topic_count = 1
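
flush1() above relies on keys() being ordered from most to least recently used, so a slice of the front of that list is the "hottest" part of the cache. A small sketch of the keep-the-top-N-and-reset idiom (N and the counts here are illustrative):

from lru import LRU

def keep_top_n(cache, n):
    # keys() is ordered most -> least recently used, so keys()[:n] is the hot set
    hot = cache.keys()[:n]
    cache.clear()
    # re-insert in reverse so the hottest key ends up most recently used again
    for k in reversed(hot):
        cache[k] = 1   # counts are reset, as in flush1()

c = LRU(10)
for k in "abcdefg":
    c[k] = 1
keep_top_n(c, 3)
print(c.keys())   # the three most recently used keys: ['g', 'f', 'e']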
Example #8
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size
        self._read_cache_limit = 0
        self._write_cache_limit = 0

        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            procs_per_host = self.circle.size // hostcnt
            self._read_cache_limit = ((max_ofile - 64) // procs_per_host) // 3
            self._write_cache_limit = ((max_ofile - 64) // procs_per_host) * 2 // 3

        # fall back to minimal limits when hostcnt is unknown or the computed limits are too small
        if self._read_cache_limit <= 0 or self._write_cache_limit <= 0:
            self._read_cache_limit = 1
            self._write_cache_limit = 8

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # this is the cnt for the enqueued items
        self.reduce_items = 0  # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.chunksums = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        for f in self.rfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        for f in self.wfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)

        # we need to do this because if the last job didn't finish cleanly,
        # fwalk files may be left behind, and fcp cleanup should remove them
        # when it gets the chance

        fwalk = "%s/fwalk.%s" % (self.circle.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem.
        It involves chunking this file and enqueueing all of its chunks. """

        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations;
        # the batching of operations is hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        if G.use_store:
            while self.treewalk.flist.qsize > 0:
                fitems, _ = self.treewalk.flist.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist.mdel(G.DB_BUFSIZE)

            # store checkpoint
            log.debug("dbname = %s" % self.circle.dbname)
            dirname = os.path.dirname(self.circle.dbname)
            basename = os.path.basename(self.circle.dbname)
            chkpointname = basename + ".CHECK_OK"
            self.checkpoint_file = os.path.join(dirname, chkpointname)
            with open(self.checkpoint_file, "w") as f:
                f.write("%s" % self.totalsize)

        else:  # use memory
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

            # memory-checkpoint
            if self.checkpoint_file:
                self.do_no_interrupt_checkpoint()
                self.checkpoint_last = MPI.Wtime()

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @return: file descriptor
        """
        if d.has_key(k):
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit
            # clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError as e:
                log.warn("FD for %s not valid when closing" % old_k, extra=self.d)

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        src = work.src
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY, self._write_cache_limit)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)

    def do_checkpoint(self):
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.src, offset=work.offset, length=work.length,
                          digest=m.hexdigest())
            self.chunksums.append(ck)
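
do_open() keeps the read/write descriptor caches bounded by hand: when the LRU is full it closes the descriptor of the least recently used entry (items()[-1]) before inserting the new one. Since the lru LRU supports an eviction callback (see Examples #1 and #3), the same bookkeeping could be expressed more directly; a sketch under that assumption (the limit and names are illustrative):

import os
from lru import LRU

def close_evicted(path, fd):
    # invoked by the LRU whenever a descriptor is pushed out of the cache
    try:
        os.close(fd)
    except OSError:
        pass  # descriptor may already be closed

rfd_cache = LRU(8, callback=close_evicted)   # 8 is an illustrative limit

def cached_open(path, flags):
    if rfd_cache.has_key(path):
        return rfd_cache[path]
    fd = os.open(path, flags)
    rfd_cache[path] = fd   # inserting may evict and close the oldest descriptor
    return fd

The callback keeps the close logic in one place instead of interleaving it with the insert path.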
Example #9
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size
        self._read_cache_limit = 0
        self._write_cache_limit = 0

        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            procs_per_host = self.circle.size // hostcnt
            self._read_cache_limit = ((max_ofile - 64) // procs_per_host) // 3
            self._write_cache_limit = ((max_ofile - 64) // procs_per_host) * 2 // 3

        # fall back to minimal limits when hostcnt is unknown or the computed limits are too small
        if self._read_cache_limit <= 0 or self._write_cache_limit <= 0:
            self._read_cache_limit = 1
            self._write_cache_limit = 8

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # this is the cnt for the enqueued items
        self.reduce_items = 0  # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        for f in self.rfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        for f in self.wfd_cache.values():
            try:
                os.close(f)
            except OSError as e:
                pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # we need to do this because if the last job didn't finish cleanly,
        # fwalk files may be left behind, and fcp cleanup should remove them
        # when it gets the chance
        """
        fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)
        """

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem.
        It involves chunking this file and enqueueing all of its chunks. """

        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations;
        # the batching of operations is hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # both memory and database checkpoint
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        #G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        #print("Total chunks: ",G.total_chunks)

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @return: file descriptor
        """
        if d.has_key(k):
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit
            # clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError as e:
                log.warn("FD for %s not valid when closing" % old_k, extra=self.d)

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        src = work.src
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY, self._write_cache_limit)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)
        print("\nMake checkpoint files: ", self.checkpoint_file)

    def do_checkpoint(self):
        # when making a checkpoint, first write workq and workq_buf into the
        # checkpoint file, then make a copy of workq_db if it exists
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, make a copy of the current workq db file,
                # i.e. the provided checkpoint db file
                self.workdir = os.getcwd()
                existingCheckpoint = os.path.join(self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existingCheckpoint, self.checkpoint_db)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        else:
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # note: the chunksum here is keyed by the dest path
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if self.use_store == False:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
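
write_bytes() in both FCP examples copies a chunk as a loop of block-sized read-then-write steps, feeding each block into a sha1 digest when verification is on. A self-contained sketch of that loop over plain file descriptors; readn/writen from the original are replaced here by os.read/os.write, with a short loop to handle partial writes:

import hashlib
import os

def copy_chunk(rfd, wfd, offset, length, blocksize=1024 * 1024, verify=False):
    """Copy `length` bytes at `offset` from rfd to wfd, optionally hashing them."""
    os.lseek(rfd, offset, os.SEEK_SET)
    os.lseek(wfd, offset, os.SEEK_SET)

    digest = hashlib.sha1() if verify else None
    remaining = length
    while remaining > 0:
        buf = os.read(rfd, min(blocksize, remaining))
        if not buf:
            break                      # unexpected EOF; stop early
        view = memoryview(buf)
        while view:                    # os.write may write fewer bytes than asked
            written = os.write(wfd, view)
            view = view[written:]
        if digest is not None:
            digest.update(buf)
        remaining -= len(buf)

    return digest.hexdigest() if digest else None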