def test_callback(self):
    """Check when the eviction callback of an LRU is and is not invoked."""
    evictions = [0]  # boxed in a list so the nested callback can mutate it
    expected_key, expected_value = 'a', 1

    def on_evict(key, value):
        # Every callback invocation in this test must be for the first entry.
        self.assertEqual(key, expected_key)
        self.assertEqual(value, expected_value)
        evictions[0] += 1

    cache = LRU(1, callback=on_evict)
    cache[expected_key] = expected_value
    cache['b'] = 1  # pushes out the first entry: callback expected once
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['b'])

    cache['b'] = 2  # overwriting an existing key: no callback expected
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['b'])
    self.assertEqual(cache.values(), [2])

    cache = LRU(1, callback=on_evict)
    cache[expected_key] = expected_value
    cache.set_callback(None)
    cache['c'] = 1  # callback was unset: no callback expected
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['c'])

    cache.set_callback(on_evict)
    del cache['c']  # explicit deletion: no callback expected
    self.assertEqual(cache.keys(), [])
class topic4:
    """Track hashtag and user counts for one topic in two LRU caches.

    ``l1`` counts hashtags and ``l2`` counts users; ``topic_count`` starts at 1
    and is incremented once per ``set_cluster`` call.
    """

    def __init__(self, c_hash, c_user, c_words):
        # c_words is accepted but not used by this variant of the class.
        self.topic_count = 1
        self.l1 = LRU(c_hash)
        self.l2 = LRU(c_user)

    def set_hashLRU(self, l):
        """Increment the hashtag counter of every item in ``l``."""
        self.set(self.l1, l)

    def set_userLRU(self, l):
        """Increment the user counter of every item in ``l``."""
        self.set(self.l2, l)

    def set(self, lru, l):
        """Add one to the count stored in ``lru`` for each key in ``l``."""
        for key in l:
            lru[key] = lru.get(key, 0) + 1

    def set_cluster(self, hashtags, users, words):
        """Count the given hashtags and users and bump ``topic_count``.

        ``words`` is accepted but ignored.
        """
        for cache, items in ((self.l1, hashtags), (self.l2, users)):
            for item in items:
                cache[item] = cache.get(item, 0) + 1
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        """Score how well the given hashtags and users match this topic.

        For each cache the score multiplies a position term (cache size minus
        the key's index in ``keys()``, summed over matching items) by a count
        term (matched counts over the total of all counts).  A bonus of
        ``matches - 1`` is added for hashtags found at an index below 250.
        ``words`` is accepted but ignored.
        """
        hash_capacity = self.l1.get_size()
        user_capacity = self.l2.get_size()

        hash_weight = 1
        hash_rank = 0
        hash_hits = 0
        for tag in hashtags:
            # NOTE(review): if LRU.get() refreshes recency, the index looked up
            # below is taken *after* that refresh -- keep this call order.
            hash_weight += self.l1.get(tag, 0)
            if not self.l1.has_key(tag):
                continue
            position = self.l1.keys().index(tag)
            hash_rank += hash_capacity - position
            if position < 250:
                hash_hits += 1

        user_weight = 1
        user_rank = 0
        for user in users:
            user_weight += self.l2.get(user, 0)
            if self.l2.has_key(user):
                user_rank += user_capacity - self.l2.keys().index(user)

        bonus = hash_hits - 1 if hash_hits != 0 else 0

        # The appended 1 keeps each denominator non-zero for an empty cache.
        hash_total = sum(self.l1.values() + [1])
        user_total = sum(self.l2.values() + [1])
        hash_term = (hash_rank / (hash_capacity + 1)) * (hash_weight / hash_total)
        user_term = (user_rank / (user_capacity + 1)) * (user_weight / user_total)
        return hash_term + user_term + bonus
def test_callback(self):
    """Check when the eviction callback of an LRU is and is not invoked."""
    evictions = [0]  # boxed in a list so the nested callback can mutate it
    expected_key, expected_value = 'a', 1

    def on_evict(key, value):
        # Every callback invocation in this test must be for ('a', 1).
        self.assertEqual(key, expected_key)
        self.assertEqual(value, expected_value)
        evictions[0] += 1

    cache = LRU(1, callback=on_evict)
    cache[expected_key] = expected_value
    cache['b'] = 1  # pushes out the first entry: callback expected once
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['b'])

    cache['b'] = 2  # overwriting an existing key: no callback expected
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['b'])
    self.assertEqual(cache.values(), [2])

    cache = LRU(1, callback=on_evict)
    cache[expected_key] = expected_value
    cache.set_callback(None)
    cache['c'] = 1  # callback was unset: no callback expected
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['c'])

    cache.set_callback(on_evict)
    del cache['c']  # explicit deletion: no callback expected
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), [])

    cache = LRU(2, callback=on_evict)
    cache['a'] = 1
    cache['b'] = 2  # both entries fit, so the counter must stay unchanged
    self.assertEqual(evictions[0], 1)
    self.assertEqual(cache.keys(), ['b', 'a'])

    cache.set_size(1)  # shrinking drops ('a', 1): callback expected
    self.assertEqual(evictions[0], 2)
    self.assertEqual(cache.keys(), ['b'])
def test_empty(self):
    """A freshly created LRU must report no keys and no values."""
    cache = LRU(1)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual([], cache.keys())
    self.assertEqual([], cache.values())
def test_empty(self):
    """A freshly created LRU must report no keys and no values."""
    cache = LRU(1)
    # Check the key view first, then the value view.
    for view in (cache.keys, cache.values):
        self.assertEqual([], view())
class StarboardEntries:
    """A way of managing starboard entries. Sort of like an ORM, but also not fully.

    Reads go through an in-memory LRU cache and fall back to the database.
    Writes update the cache immediately and are queued so that a background
    task can run the SQL one statement after another.
    """

    _pool: asyncpg.Pool = attr.ib()
    # note: entry cache isn't really a dict, but for typehinting purposes this works
    _entry_cache: typing.Dict[int, StarboardEntry] = attr.ib()
    _sql_loop_task: asyncio.Task = attr.ib()
    _sql_queries: cclass.SetUpdateAsyncQueue = attr.ib()

    def __init__(self, pool: asyncpg.Pool, cache_size: int = 200):
        self._pool = pool
        self._entry_cache = LRU(
            cache_size
        )  # the 200 should be raised as the bot grows bigger

        self._sql_queries = cclass.SetUpdateAsyncQueue()

        loop = asyncio.get_event_loop()
        self._sql_loop_task = loop.create_task(self._sql_loop())

    def stop(self):
        """Stops the SQL task loop."""
        self._sql_loop_task.cancel()

    async def _sql_loop(self):
        """Actually runs SQL updating, hopefully one after another.

        Saves speed on adding, deleting, and updating by offloading
        this step here.

        A failing statement is logged and skipped instead of terminating the
        loop, so one bad query cannot silently stop every later write.
        """
        logger = logging.getLogger("discord")
        try:
            while True:
                entry = await self._sql_queries.get()
                try:
                    logger.debug(f"Running {entry.query}.")
                    await self._pool.execute(entry.query, *entry.args, timeout=60)
                except asyncio.CancelledError:
                    # Let stop() end the loop; handled by the outer clause.
                    raise
                except Exception:
                    logger.exception(f"Failed to run {entry.query}.")
                finally:
                    # Always balance the get() above, even when the query failed.
                    self._sql_queries.task_done()
        except asyncio.CancelledError:
            pass

    def _get_required_from_entry(self, entry: StarboardEntry):
        """Transforms data into the form needed for databases.

        The order of the returned tuple matches the $1..$11 placeholders
        used by the upsert statement.
        """
        return (
            entry.ori_mes_id,
            entry.ori_chan_id,
            entry.star_var_id,
            entry.starboard_id,
            entry.author_id,
            list(entry.ori_reactors),
            list(entry.var_reactors),
            entry.guild_id,
            entry.forced,
            entry.frozen,
            entry.trashed,
        )

    def _str_builder_to_insert(
        self, str_builder: typing.List[str], entry: StarboardEntry
    ):
        """Takes data from a string builder list and eventually
        puts the data needed into the _sql_queries variable."""
        query = "".join(str_builder)
        args = self._get_required_from_entry(entry)
        self._sql_queries.put_nowait(StarboardSQLEntry(query, args))

    def _handle_upsert(self, entry: StarboardEntry):
        """Upserts an entry by using an INSERT with an ON CONFLICT clause.

        This is a PostgreSQL-specific feature, so that's nice!"""
        str_builder = [
            "INSERT INTO starboard(ori_mes_id, ori_chan_id, star_var_id, ",
            "starboard_id, author_id, ori_reactors, var_reactors, ",
            "guild_id, forced, frozen, trashed) VALUES($1, $2, $3, $4, ",
            "$5, $6, $7, $8, $9, $10, $11) ON CONFLICT (ori_mes_id) DO UPDATE ",
            "SET ori_chan_id = $2, star_var_id = $3, starboard_id = $4, ",
            "author_id = $5, ori_reactors = $6, var_reactors = $7, guild_id = $8, ",
            "forced = $9, frozen = $10, trashed = $11",
        ]
        self._str_builder_to_insert(str_builder, entry)

    @staticmethod
    def _rows_to_entries(data):
        """Converts fetched rows into a tuple of entries, or None if there are none."""
        if not data:
            return None
        return tuple(StarboardEntry.from_row(row) for row in data)

    def upsert(self, entry: StarboardEntry):
        """Either adds or updates an entry in the collection of entries."""
        # Cache the entry under the original message id and, when it has
        # one, under its starboard variant id too.
        temp_dict = {entry.ori_mes_id: entry}
        if entry.star_var_id:
            temp_dict[entry.star_var_id] = entry

        self._entry_cache.update(**temp_dict)  # type: ignore this is valid i promise
        self._handle_upsert(entry)

    def delete(self, entry_id: int):
        """Removes an entry from the collection of entries."""
        self._entry_cache.pop(entry_id, None)
        self._sql_queries.put_nowait(
            StarboardSQLEntry("DELETE FROM starboard WHERE ori_mes_id = $1", [entry_id])
        )

    async def get(
        self, entry_id: int, check_for_var: bool = False
    ) -> typing.Optional[StarboardEntry]:
        """Gets an entry from the collection of entries.

        The cache is checked first (by key, then by starboard variant id);
        on a miss the database is queried and a found row is cached under
        ``entry_id``.  With ``check_for_var`` set, an entry without a
        ``star_var_id`` is reported as None.
        """
        entry = None

        if self._entry_cache.has_key(entry_id):  # type: ignore
            entry = self._entry_cache[entry_id]
        else:
            entry = discord.utils.find(
                lambda e: e and e.star_var_id == entry_id, self._entry_cache.values()
            )

        if not entry:
            async with self._pool.acquire() as conn:
                # Bind the id as a query parameter instead of formatting it
                # into the SQL text.
                data = await conn.fetchrow(
                    "SELECT * FROM starboard WHERE ori_mes_id = $1 OR"
                    " star_var_id = $1",
                    entry_id,
                )
                if data:
                    entry = StarboardEntry.from_row(data)
                    self._entry_cache[entry_id] = entry

        if entry and check_for_var and not entry.star_var_id:
            return None

        return entry

    async def select_query(self, query: str):
        """Selects the starboard database directly for entries based on the query.

        NOTE(review): ``query`` is inserted verbatim after WHERE; it must
        never contain untrusted input.
        """
        async with self._pool.acquire() as conn:
            data = await conn.fetch(f"SELECT * FROM starboard WHERE {query}")
        return self._rows_to_entries(data)

    async def raw_query(self, query: str):
        """Runs the raw query against the pool, assuming the results are starboard entries."""
        async with self._pool.acquire() as conn:
            data = await conn.fetch(query)
        return self._rows_to_entries(data)

    async def super_raw_query(self, query: str):
        """You want a raw query? You'll get one."""
        async with self._pool.acquire() as conn:
            return await conn.fetch(query)

    async def query_entries(
        self, seperator: str = "AND", **conditions: typing.Dict[str, str]
    ) -> typing.Optional[typing.Tuple[StarboardEntry, ...]]:
        """Queries entries based on conditions provided.

        For example, you could do `query_entries(guild_id=143425)` to get
        entries with that guild id.

        NOTE(review): keys, values and ``seperator`` are formatted straight
        into the SQL text, so callers must only pass trusted values.  Left
        as-is because switching to bind parameters could change how
        existing callers' values are interpreted.
        """
        sql_conditions: list[str] = [
            f"{key} = {value}" for key, value in conditions.items()
        ]
        combined_statements = f" {seperator} ".join(sql_conditions)

        async with self._pool.acquire() as conn:
            data = await conn.fetch(
                f"SELECT * FROM starboard WHERE {combined_statements}"
            )
        return self._rows_to_entries(data)

    async def get_random(self, guild_id: int) -> typing.Optional[StarboardEntry]:
        """Gets a random entry from a guild."""
        # query adapted from
        # https://github.com/Rapptz/RoboDanny/blob/1fb95d76d1b7685e2e2ff950e11cddfc96efbfec/cogs/stars.py#L1082
        query = """SELECT *
                   FROM starboard
                   WHERE guild_id=$1
                   AND star_var_id IS NOT NULL
                   OFFSET FLOOR(RANDOM() * (
                       SELECT COUNT(*)
                       FROM starboard
                       WHERE guild_id=$1
                       AND star_var_id IS NOT NULL
                   ))
                   LIMIT 1
                """

        async with self._pool.acquire() as conn:
            data = await conn.fetchrow(query, guild_id)
            if not data:
                return None
            return StarboardEntry.from_row(data)
class topic4:
    """Track hashtag, user, word and link counts for one topic in LRU caches.

    ``l1``/``l2``/``l3`` count hashtags, users and words; ``l4`` counts links
    in a cache of fixed size 400.  ``lats``/``longs`` collect coordinates and
    ``topic_count`` starts at 1 and grows by one per ``set_cluster`` call.
    """

    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        # self.time = (self.first,self.last)
        self.l1 = LRU(c_hash)
        self.first = ""
        self.last = ""
        self.lats = []
        self.longs = []
        self.l2 = LRU(c_user)
        self.l3 = LRU(c_words)
        self.l4 = LRU(400)

    def set_hashLRU(self, l):
        """Increment the hashtag counter of every item in ``l``."""
        self.set(self.l1, l)

    def set_userLRU(self, l):
        """Increment the user counter of every item in ``l``."""
        self.set(self.l2, l)

    def set_wordLRU(self, l):
        """Increment the word counter of every item in ``l``."""
        self.set(self.l3, l)

    def set(self, lru, l):
        """Add one to the count stored in ``lru`` for each key in ``l``."""
        for key in l:
            lru[key] = lru.get(key, 0) + 1

    def set_cluster(self, hashtags, users, words, links, cords):
        """Count one item's hashtags, users, words and links for this topic.

        When ``cords`` is not None, ``cords["coordinates"][1]`` is appended to
        ``lats`` and ``cords["coordinates"][0]`` to ``longs``.
        """
        for key in hashtags:
            self.l1[key] = self.l1.get(key, 0) + 1
        for key in users:
            self.l2[key] = self.l2.get(key, 0) + 1
        for key in words:
            self.l3[key] = self.l3.get(key, 0) + 1
        for key in links:
            self.l4[key] = self.l4.get(key, 0) + 1
        if cords is not None:
            self.lats.append(cords["coordinates"][1])
            self.longs.append(cords["coordinates"][0])
        self.topic_count += 1

    @staticmethod
    def _match_stats(cache, capacity, items):
        """Return (rank_total, weight, near_hits) for ``items`` against ``cache``.

        ``weight`` starts at 1 and adds the stored count of every item;
        ``rank_total`` adds ``capacity - index`` for every item present in the
        cache; ``near_hits`` counts present items whose index is below 250.
        """
        weight = 1
        rank_total = 0
        near_hits = 0
        for item in items:
            # NOTE(review): if LRU.get() refreshes recency, the index looked up
            # below is taken *after* that refresh -- keep this call order.
            weight += cache.get(item, 0)
            if cache.has_key(item):
                position = cache.keys().index(item)
                rank_total += capacity - position
                if position < 250:
                    near_hits += 1
        return rank_total, weight, near_hits

    def get_similarity(self, hashtags, users, words):
        """Score how well the given hashtags, users and words match this topic.

        Each cache contributes ``(rank_total / (size + 1)) * (weight / total)``
        where ``total`` is the sum of all stored counts plus one.  A bonus of
        ``matches - 1`` is added for hashtags found at an index below 250.
        """
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        w1 = self.l3.get_size()

        h_ind, h_sum, h_match = self._match_stats(self.l1, h1, hashtags)
        u_ind, u_sum, _ = self._match_stats(self.l2, u1, users)
        w_ind, w_sum, _ = self._match_stats(self.l3, w1, words)

        c = h_match - 1 if h_match != 0 else 0

        # The appended 1 keeps each denominator non-zero for an empty cache.
        similarity = (
            (h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1]))
            + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1]))
            + (w_ind / (w1 + 1)) * (w_sum / sum(self.l3.values() + [1]))
            + c
        )
        return similarity

    def flush1(self, cache, size):
        """Shrink ``cache`` to the first five keys of ``keys()``, each reset to 1.

        Caches holding five keys or fewer are left untouched.  ``size`` is
        accepted for interface compatibility but is not used.
        """
        keys = cache.keys()
        if len(keys) > 5:
            # Slice the first five keys.  Indexing with [5] picked one single
            # key and then iterated over its characters.
            survivors = keys[:5]
            cache.clear()
            # Re-insert back to front; presumably keys() lists the most
            # recently used key first, so this keeps the relative order.
            for key in reversed(survivors):
                cache[key] = 1

    def flush(self):
        """Flush the hashtag, user and word caches and reset ``topic_count``."""
        self.flush1(self.l1, 500)
        self.flush1(self.l2, 500)
        self.flush1(self.l3, 3500)
        self.topic_count = 1
class FCP(BaseTask):
    """File-copy task: splits files into chunks and copies them chunk by chunk.

    Open file descriptors for sources and destinations are kept in two LRU
    caches (``rfd_cache`` / ``wfd_cache``) bounded by the read and write
    cache limits computed in ``__init__``.
    """

    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size
        # Start from any value the base class may already have set; fall back
        # to 0 so the check below works when hostcnt == 0 (previously the
        # attributes were read before ever being assigned in that case).
        read_limit = getattr(self, "_read_cache_limit", 0)
        write_limit = getattr(self, "_write_cache_limit", 0)
        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            # Floor division keeps the limits integral on Python 3 as well;
            # max() avoids dividing by zero when there are more hosts than
            # processes.
            procs_per_host = max(1, self.circle.size // hostcnt)
            fd_budget = (max_ofile - 64) // procs_per_host
            read_limit = fd_budget // 3
            write_limit = fd_budget * 2 // 3

        if read_limit <= 0 or write_limit <= 0:
            read_limit = 1
            write_limit = 8

        self._read_cache_limit = read_limit
        self._write_cache_limit = write_limit

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # this is the cnt for the enqued items
        self.reduce_items = 0  # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.chunksums = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        """Return the (read, write) descriptor cache limits."""
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        """Use ``sz`` bytes as the chunk size."""
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        """Derive the chunk size from the total size via utils.calc_chunksize."""
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        """Close cached descriptors and remove checkpoint and fwalk leftovers."""
        for cache in (self.rfd_cache, self.wfd_cache):
            for fd in cache.values():
                try:
                    os.close(fd)
                except OSError:
                    # best effort: the descriptor may already be closed
                    pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)

        # we need to do this because if last job didn't finish cleanly
        # the fwalk files can be found as leftovers
        # and if fcp cleanup has a chance, it should clean up that
        fwalk = "%s/fwalk.%s" % (self.circle.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)

    def new_fchunk(self, fitem):
        """Create a FileChunk whose src/dest are derived from ``fitem``."""
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """
        # Floor division: the chunk count feeds range(), which needs an int.
        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                  extra=self.d)

    def handle_fitem(self, fi):
        """Recreate a symlink at the destination, or enqueue a regular file."""
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # we batch operation hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                 extra=self.d)

        if G.use_store:
            while self.treewalk.flist.qsize > 0:
                fitems, _ = self.treewalk.flist.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist.mdel(G.DB_BUFSIZE)

            # store checkpoint
            log.debug("dbname = %s" % self.circle.dbname)
            dirname = os.path.dirname(self.circle.dbname)
            basename = os.path.basename(self.circle.dbname)
            chkpointname = basename + ".CHECK_OK"
            self.checkpoint_file = os.path.join(dirname, chkpointname)
            with open(self.checkpoint_file, "w") as f:
                f.write("%s" % self.totalsize)

        else:  # use memory
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

            # memory-checkpoint
            if self.checkpoint_file:
                self.do_no_interrupt_checkpoint()
                self.checkpoint_last = MPI.Wtime()

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @param flag: flags passed to os.open
        @param limit: number of cached descriptors at which one is closed first
        @return: file descriptor, or -1 when the file could not be opened
        """
        if d.has_key(k):
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit
            # clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError:
                log.warning("FD for %s not valid when closing" % old_k, extra=self.d)
            # Drop the entry right away: if the open below fails nothing is
            # inserted, and the closed descriptor would otherwise stay cached
            # and be handed out again later.
            del d[old_k]

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            # NOTE(review): returning from ``finally`` discards any exception
            # raised above, including one raised by circle.exit(); kept
            # unchanged to preserve behaviour -- confirm whether intended.
            return fd

    @staticmethod
    def do_mkdir(work):
        """Create the directory ``work.dest`` if it does not exist yet."""
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        """Copy one chunk described by ``work``.

        @return: True when the chunk was handed to write_bytes(), False when
                 the source or destination could not be opened.
        """
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    # The file was just unlinked, so it has to be re-created;
                    # opening without O_CREAT could never succeed here.
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT,
                                       self._write_cache_limit)
                    if wfd < 0:
                        log.error("Failed to create output file %s" % dest, extra=self.d)
                        return False
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                      (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        """Run do_checkpoint() in a separate thread and wait for it to finish."""
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)

    def do_checkpoint(self):
        """Close cached write descriptors and pickle the work queue atomically."""
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warning("Unknown work object: %s" % work, extra=self.d)

    def reduce_init(self, buf):
        """Seed the reduction buffer with this task's copied byte count."""
        buf['cnt_filesize'] = self.cnt_filesize

    def reduce(self, buf1, buf2):
        """Merge ``buf2`` into ``buf1`` by summing the copied byte counts."""
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        return buf1

    def reduce_report(self, buf):
        """Print progress: percent finished, bytes copied and transfer rate."""
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        """Gather per-task load counts and print the summary on rank 0."""
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @param m: hash object updated with the copied bytes, or None
        @return: False if unsuccessful.
        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            # Use the module logger like every other method of this class.
            log.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            log.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        """Copy ``work.length`` bytes at ``work.offset`` in blocksize pieces.

        When verification is enabled, a SHA-1 digest of the copied bytes is
        recorded as a ChunkSum in ``self.chunksums``.
        """
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.src, offset=work.offset, length=work.length,
                          digest=m.hexdigest())
            self.chunksums.append(ck)
class FCP(BaseTask):
    """File-copy task: splits files into chunks and copies them chunk by chunk.

    Open file descriptors for sources and destinations are kept in two LRU
    caches (``rfd_cache`` / ``wfd_cache``) bounded by the read and write
    cache limits computed in ``__init__``.  With verification enabled, chunk
    digests are kept in memory up to ``G.memitem_threshold`` items and then
    spilled to a DbStore in batches of ``G.DB_BUFSIZE``.
    """

    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # cache, keep the size conservative
        # TODO: we need a more portable LRU size
        # Start from any value the base class may already have set; fall back
        # to 0 so the check below works when hostcnt == 0 (previously the
        # attributes were read before ever being assigned in that case).
        read_limit = getattr(self, "_read_cache_limit", 0)
        write_limit = getattr(self, "_write_cache_limit", 0)
        if hostcnt != 0:
            max_ofile, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            # Floor division keeps the limits integral on Python 3 as well;
            # max() avoids dividing by zero when there are more hosts than
            # processes.
            procs_per_host = max(1, self.circle.size // hostcnt)
            fd_budget = (max_ofile - 64) // procs_per_host
            read_limit = fd_budget // 3
            write_limit = fd_budget * 2 // 3

        if read_limit <= 0 or write_limit <= 0:
            read_limit = 1
            write_limit = 8

        self._read_cache_limit = read_limit
        self._write_cache_limit = write_limit

        self.rfd_cache = LRU(self._read_cache_limit)
        self.wfd_cache = LRU(self._write_cache_limit)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # this is the cnt for the enqued items
        self.reduce_items = 0  # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def rw_cache_limit(self):
        """Return the (read, write) descriptor cache limits."""
        return (self._read_cache_limit, self._write_cache_limit)

    def set_fixed_chunksize(self, sz):
        """Use ``sz`` bytes as the chunk size."""
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        """Derive the chunk size from the total size via utils.calc_chunksize."""
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):
        """Close cached descriptors and remove checkpoint and chunksum files."""
        for cache in (self.rfd_cache, self.wfd_cache):
            for fd in cache.values():
                try:
                    os.close(fd)
                except OSError:
                    # best effort: the descriptor may already be closed
                    pass

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # we need to do this because if last job didn't finish cleanly
        # the fwalk files can be found as leftovers
        # and if fcp cleanup has a chance, it should clean up that
        # (disabled)
        # fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        # if os.path.exists(fwalk):
        #     os.remove(fwalk)

    def new_fchunk(self, fitem):
        """Create a FileChunk whose src/dest are derived from ``fitem``."""
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """
        # Floor division: the chunk count feeds range(), which needs an int.
        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                  extra=self.d)

    def handle_fitem(self, fi):
        """Recreate a symlink at the destination, or enqueue a regular file."""
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # we batch operation hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                 extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # both memory and database checkpoint
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        #G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        #print("Total chunks: ",G.total_chunks)

    def do_open(self, k, d, flag, limit):
        """
        @param k: the file path
        @param d: dictionary of <path, file descriptor>
        @param flag: flags passed to os.open
        @param limit: number of cached descriptors at which one is closed first
        @return: file descriptor, or -1 when the file could not be opened
        """
        if d.has_key(k):
            return d[k]

        if len(d.keys()) >= limit:
            # over the limit
            # clean up the least used
            old_k, old_v = d.items()[-1]
            try:
                os.close(old_v)
            except OSError:
                log.warning("FD for %s not valid when closing" % old_k, extra=self.d)
            # Drop the entry right away: if the open below fails nothing is
            # inserted, and the closed descriptor would otherwise stay cached
            # and be handed out again later.
            del d[old_k]

        fd = -1
        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d[k] = fd
        finally:
            # NOTE(review): returning from ``finally`` discards any exception
            # raised above, including one raised by circle.exit(); kept
            # unchanged to preserve behaviour -- confirm whether intended.
            return fd

    @staticmethod
    def do_mkdir(work):
        """Create the directory ``work.dest`` if it does not exist yet."""
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        """Copy one chunk described by ``work``.

        @return: True when the chunk was handed to write_bytes(), False when
                 the source or destination could not be opened.
        """
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open(src, self.rfd_cache, os.O_RDONLY, self._read_cache_limit)
        if rfd < 0:
            return False
        wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT, self._write_cache_limit)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    # The file was just unlinked, so it has to be re-created;
                    # opening without O_CREAT could never succeed here.
                    wfd = self.do_open(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT,
                                       self._write_cache_limit)
                    if wfd < 0:
                        log.error("Failed to create output file %s" % dest, extra=self.d)
                        return False
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                      (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        """Run do_checkpoint() in a separate thread and wait for it to finish."""
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        log.debug("checkpoint: %s" % self.checkpoint_file, extra=self.d)
        print("\nMake checkpoint files: ", self.checkpoint_file)

    def do_checkpoint(self):
        """Write the checkpoint file and, if present, copy the workq database."""
        # when make checkpoint, first write workq and workq_buf into checkpoint file, then make a copy of workq_db if it exists
        for k in self.wfd_cache.keys():
            os.close(self.wfd_cache[k])

        # clear the cache
        self.wfd_cache.clear()

        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, make a copy of current workq db file, which is provided checkpoint db file
                self.workdir = os.getcwd()
                existingCheckpoint = os.path.join(self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existingCheckpoint, self.checkpoint_db)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warning("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        """Seed the reduction buffer with copied bytes and a memory snapshot."""
        buf['cnt_filesize'] = self.cnt_filesize
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = max_rss
        else:
            # scaled by 1024 on every platform other than darwin
            buf['mem_snapshot'] = max_rss * 1024

    def reduce(self, buf1, buf2):
        """Merge ``buf2`` into ``buf1`` by summing both counters."""
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        """Print progress: percent finished, bytes copied, rate and memory."""
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) / self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        """Gather per-task load counts and print the summary on rank 0."""
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @param m: hash object updated with the copied bytes, or None
        @return: False if unsuccessful.
        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            # Use the module logger like every other method of this class.
            log.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            log.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        """Copy ``work.length`` bytes at ``work.offset`` in blocksize pieces.

        When verification is enabled, a SHA-1 digest of the copied bytes is
        recorded as a ChunkSum keyed by the destination path.
        """
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # the checksum is recorded against the destination path
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if not self.use_store:
                        # First spill: create the on-disk store lazily.
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]