def test_has_key(self):
    for size in SIZES:
        l = LRU(size)
        for i in range(2 * size):
            l[i] = str(i)
            self.assertTrue(l.has_key(i))
        for i in range(size, 2 * size):
            self.assertTrue(l.has_key(i))
        for i in range(size):
            self.assertFalse(l.has_key(i))
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        self.l1 = LRU(c_hash)
        self.l2 = LRU(c_user)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if (self.l1.has_key(h)):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if (self.l2.has_key(u)):
                u_ind += u1 - self.l2.keys().index(u)
        if (h_match != 0):
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, h_sum, w_sum)
        similarity = ((h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1]))
                      + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1])) + c)
        return similarity
class Newton:
    '''
    area_ids - np.array with the ids of the areas.
    serialized_forests - str path to the folder with the serialized forests.
    serialized_tree - str path to the folder with the serialized tree.
    data_dir - str path to the folder with the data.
    n_forest_results - int number of results returned by the RF.
    k - int number of nearest neighbours computed by the BallTree.
    '''
    def __init__(self, area_ids, serialized_forests, serialized_tree, data_dir,
                 cache=4, n_forest_results=3, k=5):
        self.area_ids = area_ids
        self.balltree = Tree(serialized_tree, data_dir)
        self.n_forest_results = n_forest_results
        self.k = k
        self.serialized_forests = serialized_forests
        self.cache = cache
        self.locks = {i: Lock() for i in area_ids}
        self.counters = {i: 0 for i in area_ids}
        self.active_forests = LRU(
            cache,
            callback=lambda key, value: clear(key, value, self.locks, self.counters))

    '''
    area_id - int id of the area to recommend for.
    scores - np.array (n, 5) array of scores to recommend from.
    returns np.array (n, n_forest_results, k) of recommended careers.
    '''
    def get_recs(self, area_id, scores):
        prediction = self.predict(area_id, scores, self.n_forest_results)
        recommendations = []
        for carreer_set in prediction:
            recommendations.append(self.balltree.query(carreer_set, self.k))
        return np.array(recommendations)

    def predict(self, area_id, scores, n_results):
        with self.locks[area_id]:
            self.counters[area_id] += 1
            if not self.active_forests.has_key(area_id):
                if get_mem_percentage() < 0.3:
                    clear(self.active_forests.peek_last_item()[0],
                          self.active_forests[self.active_forests.peek_last_item()[0]],
                          self.locks, self.counters)
                self.active_forests[area_id] = Forest(area_id, self.serialized_forests)
                # print(get_mem_percentage())
            forest = self.active_forests[area_id]

        prediction = forest.get_class(forest.query(scores, n_results))

        with self.locks[area_id]:
            self.counters[area_id] -= 1

        # print(self.active_forests.items())
        return prediction

    def filter_recs(self, user, carreers):
        pass
class complex_cache:
    def __init__(self, size, type):
        # the number of items
        self.size = size              # actual size of the cache
        self.lru = LRU(size)
        self.hits = 0.0
        self.reqs = 0.0
        self.cache_stack_size = 0     # how much of the cache is occupied

    def place(self, request):
        # request is a tuple (timestamp, username)
        self.reqs += 1
        if self.lru.has_key(request[-1]):
            self.lru[request[-1]] = self.lru[request[-1]] + 1
            self.hits += 1
        else:
            if self.cache_stack_size + 1 > self.size:
                print("evict an item: " + str(self.lru.peek_last_item()))
                self.cache_stack_size -= 1
            self.lru[request[-1]] = 1
            self.cache_stack_size += 1
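# Illustrative usage sketch (not from the original project): drive the
# complex_cache above with a few (timestamp, username) requests and report
# the hit ratio it accumulates. The request values are made up.
cache = complex_cache(size=3, type="lru")
for ts, user in [(1, "alice"), (2, "bob"), (3, "alice"),
                 (4, "carol"), (5, "dave"), (6, "alice")]:
    cache.place((ts, user))
print("hit ratio:", cache.hits / cache.reqs)   # 2 hits out of 6 requests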
class StarboardEntries:
    """A way of managing starboard entries.
    Sort of like an ORM, but also not fully."""

    _pool: asyncpg.Pool = attr.ib()
    # note: entry cache isn't really a dict, but for typehinting purposes this works
    _entry_cache: typing.Dict[int, StarboardEntry] = attr.ib()
    _sql_loop_task: asyncio.Task = attr.ib()
    _sql_queries: cclass.SetUpdateAsyncQueue = attr.ib()

    def __init__(self, pool: asyncpg.Pool, cache_size: int = 200):
        self._pool = pool
        self._entry_cache = LRU(
            cache_size
        )  # the 200 should be raised as the bot grows bigger
        self._sql_queries = cclass.SetUpdateAsyncQueue()

        loop = asyncio.get_event_loop()
        self._sql_loop_task = loop.create_task(self._sql_loop())

    def stop(self):
        """Stops the SQL task loop."""
        self._sql_loop_task.cancel()

    async def _sql_loop(self):
        """Actually runs SQL updating, hopefully one after another.

        Saves speed on adding, deleting, and updating by offloading this step here."""
        try:
            while True:
                entry = await self._sql_queries.get()
                logging.getLogger("discord").debug(f"Running {entry.query}.")
                await self._pool.execute(entry.query, *entry.args, timeout=60)
                self._sql_queries.task_done()
        except asyncio.CancelledError:
            pass

    def _get_required_from_entry(self, entry: StarboardEntry):
        """Transforms data into the form needed for databases."""
        return (
            entry.ori_mes_id,
            entry.ori_chan_id,
            entry.star_var_id,
            entry.starboard_id,
            entry.author_id,
            list(entry.ori_reactors),
            list(entry.var_reactors),
            entry.guild_id,
            entry.forced,
            entry.frozen,
            entry.trashed,
        )

    def _str_builder_to_insert(
        self, str_builder: typing.List[str], entry: StarboardEntry
    ):
        """Takes data from a string builder list and eventually puts the data
        needed into the _sql_queries variable."""
        query = "".join(str_builder)
        args = self._get_required_from_entry(entry)
        self._sql_queries.put_nowait(StarboardSQLEntry(query, args))

    def _handle_upsert(self, entry: StarboardEntry):
        """Upserts an entry by using an INSERT with an ON CONFLICT clause.

        This is a PostgreSQL-specific feature, so that's nice!"""
        str_builder = [
            "INSERT INTO starboard(ori_mes_id, ori_chan_id, star_var_id, ",
            "starboard_id, author_id, ori_reactors, var_reactors, ",
            "guild_id, forced, frozen, trashed) VALUES($1, $2, $3, $4, ",
            "$5, $6, $7, $8, $9, $10, $11) ON CONFLICT (ori_mes_id) DO UPDATE ",
            "SET ori_chan_id = $2, star_var_id = $3, starboard_id = $4, ",
            "author_id = $5, ori_reactors = $6, var_reactors = $7, guild_id = $8, ",
            "forced = $9, frozen = $10, trashed = $11",
        ]
        self._str_builder_to_insert(str_builder, entry)

    def upsert(self, entry: StarboardEntry):
        """Either adds or updates an entry in the collection of entries."""
        temp_dict = {entry.ori_mes_id: entry}
        if entry.star_var_id:
            temp_dict[entry.star_var_id] = entry

        self._entry_cache.update(**temp_dict)  # type: ignore this is valid i promise
        self._handle_upsert(entry)

    def delete(self, entry_id: int):
        """Removes an entry from the collection of entries."""
        self._entry_cache.pop(entry_id, None)
        self._sql_queries.put_nowait(
            StarboardSQLEntry("DELETE FROM starboard WHERE ori_mes_id = $1", [entry_id])
        )

    async def get(
        self, entry_id: int, check_for_var: bool = False
    ) -> typing.Optional[StarboardEntry]:
        """Gets an entry from the collection of entries."""
        entry = None

        if self._entry_cache.has_key(entry_id):  # type: ignore
            entry = self._entry_cache[entry_id]
        else:
            entry = discord.utils.find(
                lambda e: e and e.star_var_id == entry_id,
                self._entry_cache.values(),
            )

        if not entry:
            async with self._pool.acquire() as conn:
                data = await conn.fetchrow(
                    f"SELECT * FROM starboard WHERE ori_mes_id = {entry_id} OR"
                    f" star_var_id = {entry_id}"
                )
                if data:
                    entry = StarboardEntry.from_row(data)
                    self._entry_cache[entry_id] = entry

        if entry and check_for_var and not entry.star_var_id:
            return None

        return entry

    async def select_query(self, query: str):
        """Selects the starboard database directly for entries based on the query."""
        async with self._pool.acquire() as conn:
            data = await conn.fetch(f"SELECT * FROM starboard WHERE {query}")
            if not data:
                return None
            return tuple(StarboardEntry.from_row(row) for row in data)

    async def raw_query(self, query: str):
        """Runs the raw query against the pool, assuming the results are starboard entries."""
        async with self._pool.acquire() as conn:
            data = await conn.fetch(query)
            if not data:
                return None
            return tuple(StarboardEntry.from_row(row) for row in data)

    async def super_raw_query(self, query: str):
        """You want a raw query? You'll get one."""
        async with self._pool.acquire() as conn:
            return await conn.fetch(query)

    async def query_entries(
        self, seperator: str = "AND", **conditions: typing.Dict[str, str]
    ) -> typing.Optional[typing.Tuple[StarboardEntry, ...]]:
        """Queries entries based on conditions provided.

        For example, you could do `query_entries(guild_id=143425)` to get
        entries with that guild id."""
        sql_conditions: list[str] = [
            f"{key} = {value}" for key, value in conditions.items()
        ]
        combined_statements = f" {seperator} ".join(sql_conditions)

        async with self._pool.acquire() as conn:
            data = await conn.fetch(
                f"SELECT * FROM starboard WHERE {combined_statements}"
            )

        if not data:
            return None

        return tuple(StarboardEntry.from_row(row) for row in data)

    async def get_random(self, guild_id: int) -> typing.Optional[StarboardEntry]:
        """Gets a random entry from a guild."""
        # query adapted from
        # https://github.com/Rapptz/RoboDanny/blob/1fb95d76d1b7685e2e2ff950e11cddfc96efbfec/cogs/stars.py#L1082
        query = """SELECT * FROM starboard
                   WHERE guild_id=$1 AND star_var_id IS NOT NULL
                   OFFSET FLOOR(RANDOM() * (
                       SELECT COUNT(*) FROM starboard
                       WHERE guild_id=$1 AND star_var_id IS NOT NULL
                   ))
                   LIMIT 1
                """

        async with self._pool.acquire() as conn:
            data = await conn.fetchrow(query, guild_id)

        if not data:
            return None

        return StarboardEntry.from_row(data)
class FileServer(fileService_pb2_grpc.FileserviceServicer):
    def __init__(self, hostname, server_port, activeNodesChecker, shardingHandler,
                 superNodeAddress):
        self.serverPort = server_port
        self.serverAddress = hostname + ":" + server_port
        self.activeNodesChecker = activeNodesChecker
        self.shardingHandler = shardingHandler
        self.hostname = hostname
        self.lru = LRU(5)
        self.superNodeAddress = superNodeAddress

    #
    # This service gets invoked when user uploads a new file.
    #
    def UploadFile(self, request_iterator, context):
        print("Inside Server method ---------- UploadFile")
        data = bytes("", 'utf-8')
        username, filename = "", ""
        totalDataSize = 0
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()

        # list to store the info related to file location.
        metaData = []

        # If the node is the leader of the cluster.
        if (int(db.get("primaryStatus")) == 1):
            print("Inside primary upload")
            currDataSize = 0
            currDataBytes = bytes("", 'utf-8')
            seqNo = 1

            # Step 1:
            # Get 2 least loaded nodes based on the CPU stats.
            # 'Node' is where the actual data goes and 'node_replica' is where replica will go.
            node, node_replica = self.getLeastLoadedNode()

            if (node == -1):
                return fileService_pb2.ack(
                    success=False, message="Error Saving File. No active nodes.")

            # Step 2:
            # Check whether file already exists, if yes then return with message 'File already exists'.
            for request in request_iterator:
                username, filename = request.username, request.filename
                print("Key is-----------------", username + "_" + filename)
                if (self.fileExists(username, filename) == 1):
                    print("sending neg ack")
                    return fileService_pb2.ack(
                        success=False,
                        message="File already exists for this user. Please rename or delete file first.")
                break

            # Step 3:
            # Make chunks of size 'UPLOAD_SHARD_SIZE' and start sending the data to the
            # least utilized node through gRPC streaming.
            currDataSize += sys.getsizeof(request.data)
            currDataBytes += request.data

            for request in request_iterator:
                if ((currDataSize + sys.getsizeof(request.data)) > UPLOAD_SHARD_SIZE):
                    response = self.sendDataToDestination(
                        currDataBytes, node, node_replica, username, filename,
                        seqNo, active_ip_channel_dict[node])
                    metaData.append([node, seqNo, node_replica])
                    currDataBytes = request.data
                    currDataSize = sys.getsizeof(request.data)
                    seqNo += 1
                    node, node_replica = self.getLeastLoadedNode()
                else:
                    currDataSize += sys.getsizeof(request.data)
                    currDataBytes += request.data

            if (currDataSize > 0):
                response = self.sendDataToDestination(
                    currDataBytes, node, node_replica, username, filename, seqNo,
                    active_ip_channel_dict[node])
                metaData.append([node, seqNo, node_replica])

            # Step 4:
            # Save the metadata on the primary node after the completion of sharding.
            if (response.success):
                db.saveMetaData(username, filename, metaData)
                db.saveUserFile(username, filename)

            # Step 5:
            # Make a gRPC call to replicate the metadata on all the other nodes.
            self.saveMetadataOnAllNodes(username, filename, metaData)

            return fileService_pb2.ack(success=True, message="Saved")

        # If the node is not the leader.
        else:
            print("Saving the data on my local db")
            sequenceNumberOfChunk = 0
            dataToBeSaved = bytes("", 'utf-8')

            # Gather all the data from gRPC stream
            for request in request_iterator:
                username, filename, sequenceNumberOfChunk = request.username, request.filename, request.seqNo
                dataToBeSaved += request.data
            key = username + "_" + filename + "_" + str(sequenceNumberOfChunk)

            # Save the data in local DB.
            db.setData(key, dataToBeSaved)

            # After saving the chunk in the local DB, make a gRPC call to save the replica
            # of the chunk on different node only if the replicaNode is present.
            if (request.replicaNode != ""):
                print("Sending replication to ", request.replicaNode)
                replica_channel = active_ip_channel_dict[request.replicaNode]
                t1 = Thread(target=self.replicateChunkData,
                            args=(replica_channel, dataToBeSaved, username, filename,
                                  sequenceNumberOfChunk,))
                t1.start()
                # stub = fileService_pb2_grpc.FileserviceStub(replica_channel)
                # response = stub.UploadFile(self.sendDataInStream(dataToBeSaved, username, filename, sequenceNumberOfChunk, ""))

            return fileService_pb2.ack(success=True, message="Saved")

    def replicateChunkData(self, replica_channel, dataToBeSaved, username, filename,
                           sequenceNumberOfChunk):
        stub = fileService_pb2_grpc.FileserviceStub(replica_channel)
        response = stub.UploadFile(
            self.sendDataInStream(dataToBeSaved, username, filename,
                                  sequenceNumberOfChunk, ""))

    # This helper method is responsible for sending the data to destination node
    # through gRPC stream.
    def sendDataToDestination(self, currDataBytes, node, nodeReplica, username,
                              filename, seqNo, channel):
        if (node == self.serverAddress):
            key = username + "_" + filename + "_" + str(seqNo)
            db.setData(key, currDataBytes)
            if (nodeReplica != ""):
                print("Sending replication to ", nodeReplica)
                active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
                replica_channel = active_ip_channel_dict[nodeReplica]
                stub = fileService_pb2_grpc.FileserviceStub(replica_channel)
                response = stub.UploadFile(
                    self.sendDataInStream(currDataBytes, username, filename, seqNo, ""))
                return response
        else:
            print("Sending the UPLOAD_SHARD_SIZE to node :", node)
            stub = fileService_pb2_grpc.FileserviceStub(channel)
            response = stub.UploadFile(
                self.sendDataInStream(currDataBytes, username, filename, seqNo,
                                      nodeReplica))
            print("Response from uploadFile: ", response.message)
            return response

    # This helper method actually makes chunks of less than 4MB and streams them through gRPC.
    # 4 MB is the max data packet size in gRPC while sending. That's why it is necessary.
    def sendDataInStream(self, dataBytes, username, filename, seqNo, replicaNode):
        chunk_size = 4000000
        start, end = 0, chunk_size
        while (True):
            chunk = dataBytes[start:end]
            if (len(chunk) == 0):
                break
            start = end
            end += chunk_size
            yield fileService_pb2.FileData(username=username,
                                           filename=filename,
                                           data=chunk,
                                           seqNo=seqNo,
                                           replicaNode=replicaNode)

    #
    # This service gets invoked when user requests an uploaded file.
    #
    def DownloadFile(self, request, context):
        print("Inside Download")

        # If the node is the leader of the cluster.
        if (int(db.get("primaryStatus")) == 1):
            print("Inside primary download")

            # Check if file exists
            if (self.fileExists(request.username, request.filename) == 0):
                print("File does not exist")
                yield fileService_pb2.FileData(username=request.username,
                                               filename=request.filename,
                                               data=bytes("", 'utf-8'),
                                               seqNo=0)
                return

            # If the file is present in cache then just fetch it and return.
            # No need to go to individual node.
            if (self.lru.has_key(request.username + "_" + request.filename)):
                print("Fetching data from Cache")
                CHUNK_SIZE = 4000000
                fileName = request.username + "_" + request.filename
                filePath = self.lru[fileName]
                outfile = os.path.join(filePath, fileName)
                with open(outfile, 'rb') as infile:
                    while True:
                        chunk = infile.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        yield fileService_pb2.FileData(username=request.username,
                                                       filename=request.filename,
                                                       data=chunk,
                                                       seqNo=1)

            # If the file is not present in the cache, then fetch it from the individual node.
            else:
                print("Fetching the metadata")

                # Step 1: get metadata i.e. the location of chunks.
                metaData = db.parseMetaData(request.username, request.filename)
                print(metaData)

                # Step 2: make gRPC calls and get the fileData from all the nodes.
                downloadHelper = DownloadHelper(self.hostname, self.serverPort,
                                                self.activeNodesChecker)
                data = downloadHelper.getDataFromNodes(request.username,
                                                       request.filename, metaData)
                print("Sending the data to client")

                # Step 3: send the file to supernode using gRPC streaming.
                chunk_size = 4000000
                start, end = 0, chunk_size
                while (True):
                    chunk = data[start:end]
                    if (len(chunk) == 0):
                        break
                    start = end
                    end += chunk_size
                    yield fileService_pb2.FileData(username=request.username,
                                                   filename=request.filename,
                                                   data=chunk,
                                                   seqNo=request.seqNo)

                # Step 4: update the cache based on LRU (least recently used) algorithm.
                self.saveInCache(request.username, request.filename, data)

        # If the node is not the leader, then just fetch the fileChunk from the local db
        # and stream it back to the leader.
        else:
            key = request.username + "_" + request.filename + "_" + str(request.seqNo)
            print(key)
            data = db.getFileData(key)
            chunk_size = 4000000
            start, end = 0, chunk_size
            while (True):
                chunk = data[start:end]
                if (len(chunk) == 0):
                    break
                start = end
                end += chunk_size
                yield fileService_pb2.FileData(username=request.username,
                                               filename=request.filename,
                                               data=chunk,
                                               seqNo=request.seqNo)

    # This service is responsible for fetching all the files.
    def FileList(self, request, context):
        print("File List Called")
        userFiles = db.getUserFiles(request.username)
        return fileService_pb2.FileListResponse(Filenames=str(userFiles))

    # This helper method checks whether the file is present in db or not.
    def fileExists(self, username, filename):
        print("isFile Present", db.keyExists(username + "_" + filename))
        return db.keyExists(username + "_" + filename)

    # This helper method returns the 2 least loaded nodes from the cluster.
    def getLeastLoadedNode(self):
        print("Ready to enter sharding handler")
        node, node_replica = self.shardingHandler.leastUtilizedNode()
        print("Least loaded node is :", node)
        print("Replica node - ", node_replica)
        return node, node_replica

    # This helper method replicates the metadata on all nodes.
    def saveMetadataOnAllNodes(self, username, filename, metadata):
        print("saveMetadataOnAllNodes")
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
        uniqueFileName = username + "_" + filename
        for ip, channel in active_ip_channel_dict.items():
            if (self.isChannelAlive(channel)):
                stub = fileService_pb2_grpc.FileserviceStub(channel)
                response = stub.MetaDataInfo(
                    fileService_pb2.MetaData(
                        filename=uniqueFileName,
                        seqValues=str(metadata).encode('utf-8')))
                print(response.message)

    # This service is responsible for saving the metadata on local db.
    def MetaDataInfo(self, request, context):
        print("Inside Metadatainfo")
        fileName = request.filename
        seqValues = request.seqValues
        db.saveMetaDataOnOtherNodes(fileName, seqValues)
        ack_message = "Successfully saved the metadata on " + self.serverAddress
        return fileService_pb2.ack(success=True, message=ack_message)

    # This helper method checks whether the created channel is alive or not.
    def isChannelAlive(self, channel):
        try:
            grpc.channel_ready_future(channel).result(timeout=1)
        except grpc.FutureTimeoutError:
            # print("Connection timeout. Unable to connect to port ")
            return False
        return True

    # This helper method is responsible for updating the cache for faster lookup.
    def saveInCache(self, username, filename, data):
        if (len(self.lru.items()) >= self.lru.get_size()):
            fileToDel, path = self.lru.peek_last_item()
            os.remove(path + "/" + fileToDel)

        self.lru[username + "_" + filename] = "cache"
        filePath = os.path.join('cache', username + "_" + filename)
        saveFile = open(filePath, 'wb')
        saveFile.write(data)
        saveFile.close()

    # This service is responsible for sending the whole cluster stats to the superNode.
    def getClusterStats(self, request, context):
        print("Inside getClusterStats")
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
        total_cpu_usage, total_disk_space, total_used_mem = 0.0, 0.0, 0.0
        total_nodes = 0
        for ip, channel in active_ip_channel_dict.items():
            if (self.isChannelAlive(channel)):
                stub = heartbeat_pb2_grpc.HearBeatStub(channel)
                stats = stub.isAlive(heartbeat_pb2.NodeInfo(ip="", port=""))
                # accumulate the per-node stats so the averages below are meaningful
                total_cpu_usage += float(stats.cpu_usage)
                total_disk_space += float(stats.disk_space)
                total_used_mem += float(stats.used_mem)
                total_nodes += 1

        if (total_nodes == 0):
            return fileService_pb2.ClusterStats(cpu_usage=str(100.00),
                                                disk_space=str(100.00),
                                                used_mem=str(100.00))

        return fileService_pb2.ClusterStats(
            cpu_usage=str(total_cpu_usage / total_nodes),
            disk_space=str(total_disk_space / total_nodes),
            used_mem=str(total_used_mem / total_nodes))

    # This service is responsible for sending the leader info to the superNode
    # as soon as the leader changes.
    def getLeaderInfo(self, request, context):
        channel = grpc.insecure_channel('{}'.format(self.superNodeAddress))
        stub = fileService_pb2_grpc.FileserviceStub(channel)
        response = stub.getLeaderInfo(
            fileService_pb2.ClusterInfo(ip=self.hostname,
                                        port=self.serverPort,
                                        clusterName="team1"))
        print(response.message)

    #
    # This service gets invoked when user deletes a file.
    #
    def FileDelete(self, request, data):
        username = request.username
        filename = request.filename

        if (int(db.get("primaryStatus")) == 1):
            if (self.fileExists(username, filename) == 0):
                print("File does not exist")
                return fileService_pb2.ack(success=False,
                                           message="File does not exist")

            print("Fetching metadata from leader")
            metadata = db.parseMetaData(request.username, request.filename)
            print("Successfully retrieved metadata from leader")

            deleteHelper = DeleteHelper(self.hostname, self.serverPort,
                                        self.activeNodesChecker)
            deleteHelper.deleteFileChunksAndMetaFromNodes(username, filename, metadata)

            return fileService_pb2.ack(
                success=True, message="Successfully deleted file from the cluster")
        else:
            seqNo = -1
            try:
                seqNo = request.seqNo
            except:
                return fileService_pb2.ack(success=False, message="Internal Error")

            metaDataKey = username + "_" + filename
            dataChunkKey = username + "_" + filename + "_" + str(seqNo)

            if (db.keyExists(metaDataKey) == 1):
                print("FileDelete: Deleting the metadataEntry from local db :")
                db.deleteEntry(metaDataKey)
            if (db.keyExists(dataChunkKey)):
                print("FileDelete: Deleting the data chunk from local db: ")
                db.deleteEntry(dataChunkKey)

            return fileService_pb2.ack(
                success=True, message="Successfully deleted file from the cluster")

    #
    # This service gets invoked when user wants to check if the file is present.
    #
    def FileSearch(self, request, data):
        username, filename = request.username, request.filename

        if (self.fileExists(username, filename) == 1):
            return fileService_pb2.ack(success=True,
                                       message="File exists in the cluster.")
        else:
            return fileService_pb2.ack(
                success=False, message="File does not exist in the cluster.")

    #
    # This service gets invoked when user wants to update a file.
    #
    def UpdateFile(self, request_iterator, context):
        username, filename = "", ""
        fileData = bytes("", 'utf-8')

        for request in request_iterator:
            fileData += request.data
            username, filename = request.username, request.filename

        def getFileChunks(fileData):
            # Maximum chunk size that can be sent
            CHUNK_SIZE = 4000000
            sTime = time.time()
            # fileData is a bytes object, so slice it into chunks of CHUNK_SIZE.
            start, end = 0, CHUNK_SIZE
            while True:
                chunk = fileData[start:end]
                if not chunk:
                    break
                start = end
                end += CHUNK_SIZE
                yield fileService_pb2.FileData(username=username,
                                               filename=filename,
                                               data=chunk,
                                               seqNo=1)
            print("Time for upload= ", time.time() - sTime)

        if (int(db.get("primaryStatus")) == 1):
            channel = grpc.insecure_channel('{}'.format(self.serverAddress))
            stub = fileService_pb2_grpc.FileserviceStub(channel)

            response1 = stub.FileDelete(
                fileService_pb2.FileInfo(username=username, filename=filename))

            if (response1.success):
                response2 = stub.UploadFile(getFileChunks(fileData))
                if (response2.success):
                    return fileService_pb2.ack(success=True,
                                               message="File successfully updated.")
                else:
                    return fileService_pb2.ack(success=False,
                                               message="Internal error.")
            else:
                return fileService_pb2.ack(success=False, message="Internal error.")
class topic4:
    def __init__(self, c_hash, c_user, c_words):
        self.topic_count = 1
        # self.time = (self.first, self.last)
        self.l1 = LRU(c_hash)
        self.first = ""
        self.last = ""
        self.lats = []
        self.longs = []
        self.l2 = LRU(c_user)
        self.l3 = LRU(c_words)
        self.l4 = LRU(400)

    def set_hashLRU(self, l):
        self.set(self.l1, l)

    def set_userLRU(self, l):
        self.set(self.l2, l)

    def set_wordLRU(self, l):
        self.set(self.l3, l)

    def set(self, lru, l):
        for k in l:
            v = lru.get(k, 0)
            lru[k] = v + 1

    def set_cluster(self, hashtags, users, words, links, cords):
        for k in hashtags:
            self.l1[k] = self.l1.get(k, 0) + 1
        for k in users:
            self.l2[k] = self.l2.get(k, 0) + 1
        for k in words:
            self.l3[k] = self.l3.get(k, 0) + 1
        for k in links:
            self.l4[k] = self.l4.get(k, 0) + 1
        if (cords is not None):
            self.lats.append(cords["coordinates"][1])
            self.longs.append(cords["coordinates"][0])
        self.topic_count += 1

    def get_similarity(self, hashtags, users, words):
        h_sum = 1
        u_sum = 1
        w_sum = 1
        h_match = 0
        h_ind = 0
        u_ind = 0
        w_ind = 0
        c = 0
        h1 = self.l1.get_size()
        u1 = self.l2.get_size()
        w1 = self.l3.get_size()
        for h in hashtags:
            # l1_items = zip(*self.l1.items())
            h_sum += self.l1.get(h, 0)
            if (self.l1.has_key(h)):
                ind = self.l1.keys().index(h)
                h_ind += h1 - ind
                h_match += 1 if ind < 250 else 0
        for u in users:
            u_sum += self.l2.get(u, 0)
            if (self.l2.has_key(u)):
                u_ind += u1 - self.l2.keys().index(u)
        for w in words:
            w_sum += self.l3.get(w, 0)
            if (self.l3.has_key(w)):
                w_ind += w1 - self.l3.keys().index(w)
        if (h_match != 0):
            c = h_match - 1
        # print(h_ind, h1, u_ind, u1, w_ind, w1, h_sum, w_sum)
        similarity = ((h_ind / (h1 + 1)) * (h_sum / sum(self.l1.values() + [1]))
                      + (u_ind / (u1 + 1)) * (u_sum / sum(self.l2.values() + [1]))
                      + (w_ind / (w1 + 1)) * (w_sum / sum(self.l3.values() + [1])) + c)
        return similarity

    def flush1(self, cache, size):
        if (len(cache.keys()) > 5):
            # keep the five most recently used tokens and reset their counts
            tokens = reversed(cache.keys()[:5])
            cache.clear()
            for i in tokens:
                cache[i] = 1

    def flush(self):
        self.flush1(self.l1, 500)
        self.flush1(self.l2, 500)
        self.flush1(self.l3, 3500)
        self.topic_count = 1
l.keys()             # Can get keys alone in MRU order
# Would print [3, 5, 4, 2, 1]

del l[4]             # Delete an item
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2'), (1, '1')]

print(l.get_size())
# Would print 5

l.set_size(3)
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2')]
print(l.get_size())
# Would print 3

print(l.has_key(5))
# Would print True
print(2 in l)
# Would print True

print(l.get_stats())
# Would print (1, 0)

l.update({5: '0'})   # Update an item
print(l.items())
# Would print [(5, '0'), (3, '3'), (2, '2')]

l.clear()
print(l.items())
# Would print []
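# A minimal sketch of the eviction callback used by the Newton class further up:
# the callable is invoked with the evicted (key, value) pair once the LRU is full.
# The callback body here is illustrative only.
def evicted(key, value):
    print("evicted:", key, value)

c = LRU(2, callback=evicted)
c[1] = "a"
c[2] = "b"
c[3] = "c"   # evicts key 1, so this prints: evicted: 1 a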
class Cache:
    """Class representing D3N."""

    # Replacement policies
    LRU = "LRU"
    LFU = "LFU"
    LRU_S = "LRU_S"
    FIFO = "FIFO"
    RAND = "RAND"

    # Write policies
    WRITE_BACK = "WB"
    WRITE_THROUGH = "WT"

    # Layer
    L1 = "L1"
    L2 = "L2"

    consistent = "consistent"
    rendezvous = "rendezvous"
    rr = "rr"

    def __init__(self, layer, size, replace_pol, write_pol, hash_ring, hash_type,
                 obj_size, full_size, logger):
        self._replace_pol = replace_pol  # Replacement policy
        self._write_pol = write_pol      # Write policy
        self._layer = layer              # Layer info
        self._size = size                # Cache size
        self.spaceLeft = size            # Cache size
        self._logger = logger
        self.hashmap = {}                # Mapping
        self.hash_ring = hash_ring
        self._hash_type = hash_type
        self._obj_size = obj_size

        if (self._size == 0):
            self.zerosize = True
            self._size = 1
        else:
            self.zerosize = False

        if (self._replace_pol == Cache.LRU):
            self.cache = LRU(self._size)
        elif (self._replace_pol == Cache.FIFO):
            self.cache = deque()
        elif (self._replace_pol == Cache.LRU_S):
            self.cache = LRU(self._size)
            self.shadow = LRU(full_size)
            self.hist = []
            for i in range(full_size):
                self.hist.append(0)

        # Statistics
        self._hit_count = 0
        self._miss_count = 0
        self._backend_bw = 0
        self._crossrack_bw = 0
        self._intrarack_bw = 0
        self.miss_lat = 0
        self.lat_count = 0

    def _insert1(self, key, size):
        # No eviction
        if not self.zerosize:
            if (self._replace_pol == Cache.LRU_S):
                self.shadow[key] = 1

            if (int(size) <= self.spaceLeft):
                if (self._replace_pol == Cache.LRU):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.LRU_S):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.FIFO):
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)
            else:
                while (int(size) > self.spaceLeft):
                    self._evict()
                if (self._replace_pol == Cache.LRU):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.LRU_S):
                    self.cache[key] = int(size)
                elif (self._replace_pol == Cache.FIFO):
                    self.cache.append(key)
                self.hashmap[key] = int(size)
                self.spaceLeft -= int(size)

    def _insert(self, key, size):
        # No eviction
        if not self.zerosize:
            if (self._replace_pol == Cache.LRU_S):
                self.cache[key] = int(size)
                self.shadow[key] = int(size)
            elif (self._replace_pol == Cache.LRU):
                self.cache[key] = int(size)
            else:
                if (int(size) <= self.spaceLeft):
                    if (self._replace_pol == Cache.LRU):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.LRU_S):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.FIFO):
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)
                else:
                    while (int(size) > self.spaceLeft):
                        self._evict()
                    if (self._replace_pol == Cache.LRU):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.LRU_S):
                        self.cache[key] = int(size)
                    elif (self._replace_pol == Cache.FIFO):
                        self.cache.append(key)
                    self.hashmap[key] = int(size)
                    self.spaceLeft -= int(size)

    def read1(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return None

        r = None
        if (self._replace_pol == Cache.LRU_S):
            if self.shadow.has_key(key):
                count = 0
                for i in self.shadow.keys():
                    if i == key:
                        self.hist[count] += 1
                        break
                    count += 1
            self.shadow[key] = 1

        if key in self.hashmap:
            if (self._replace_pol == Cache.LRU):
                self._update_use(key)
            elif (self._replace_pol == Cache.LRU_S):
                self._update_use(key)
            self._hit_count += 1
            r = 1
        else:
            self._miss_count += 1
        return r

    def read(self, key, size):
        """Read an object from the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return None

        r = None
        if (self._replace_pol == Cache.LRU_S):
            if self.cache.has_key(key):
                self._hit_count += 1
                self.cache[key] = self.cache[key]
                r = 1
            else:
                self._miss_count += 1
                if self.shadow.has_key(key):
                    count = 0
                    for i in self.shadow.keys():
                        if i == key:
                            self.hist[count] += 1
                            break
                        count += 1
                self.shadow[key] = 1
        else:
            if key in self.hashmap:
                if (self._replace_pol == Cache.LRU):
                    self._update_use(key)
                elif (self._replace_pol == Cache.LRU_S):
                    self._update_use(key)
                self._hit_count += 1
                r = 1
            else:
                self._miss_count += 1
        return r

    def checkKey(self, key):
        """Check whether a key is present in the cache."""
        if self._layer == "BE":
            return 1
        if self.zerosize == True:
            return 0

        r = 0
        if (self._replace_pol == Cache.LRU_S) or (self._replace_pol == Cache.LRU):
            if self.cache.has_key(key):
                r = 1
            else:
                r = 0
        return r

    def _evict(self):
        if (self._replace_pol == Cache.LRU):
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif (self._replace_pol == Cache.LRU_S):
            id = self.cache.peek_last_item()[0]
            del self.cache[id]
        elif (self._replace_pol == Cache.FIFO):
            id = self.cache.popleft()
        self.spaceLeft += int(self.hashmap[id])
        del self.hashmap[id]

    def _update_use(self, key):
        """Update the use of a cache."""
        if (self._replace_pol == Cache.LRU):
            self.cache[key] = self.hashmap[key]
        if (self._replace_pol == Cache.LRU_S):
            self.cache[key] = self.hashmap[key]

    def set_cache_size(self, size):
        new_size = self.cache.get_size() + int(size)
        self.cache.set_size(int(new_size))

    def set_backend_bw(self, value):
        self._backend_bw += value

    def set_crossrack_bw(self, value):
        self._crossrack_bw += value

    def set_intrarack_bw(self, value):
        self._intrarack_bw += value

    def get_backend_bw(self):
        return self._backend_bw

    def get_crossrack_bw(self):
        return self._crossrack_bw

    def get_intrarack_bw(self):
        return self._intrarack_bw

    def get_replace_pol(self):
        return self._replace_pol

    def get_hit_count(self):
        return self._hit_count

    def get_miss_count(self):
        return self._miss_count

    def get_available_space(self):
        return self.spaceLeft

    def get_replace_poll(self):
        return self._replace_pol

    def reset_shadow_cache(self):
        self.shadow.clear()

    def print_cache(self):
        print(self.cache)

    def get_l2_address(self, key):
        if (self._hash_type == Cache.consistent):
            return self.hash_ring.get_node(key)
        elif (self._hash_type == Cache.rendezvous):
            return self.hash_ring.find_node(key)
        elif (self._hash_type == Cache.rr):
            val = key.split("_")[1]
            res = int(val) % int(self.hash_ring)
            return res
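# Illustrative driver (not part of the original simulator): build a small L1
# cache with the plain LRU policy and check the hit/miss accounting. hash_ring
# and logger are not exercised on this path, so None is passed; values are made up.
sim = Cache(layer=Cache.L1, size=100, replace_pol=Cache.LRU, write_pol=Cache.WRITE_BACK,
            hash_ring=None, hash_type=Cache.consistent, obj_size=10, full_size=1000,
            logger=None)
sim._insert1("obj_1", 10)                          # admit an object of size 10
sim.read("obj_1", 10)                              # hit
sim.read("obj_2", 10)                              # miss
print(sim.get_hit_count(), sim.get_miss_count())   # 1 1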
class ImpalaLogger(object):
    def __init__(self, nodes: list, elasticsearch: Elasticsearch = None,
                 lru_size: int = 5000):
        self.nodes = nodes
        self.queries_logged = LRU(lru_size)
        self.elasticsearch = elasticsearch

    def run(self):
        """
        We need to get from all the nodes the required information. In this case
        we need to first of all load the queries, and then the metadata on the
        query profile.
        :return:
        """
        queries = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = {
                executor.submit(self.query_retriever, node, 2): node
                for node in self.nodes
            }

            for future in concurrent.futures.as_completed(futures):
                node = futures[future]
                try:
                    retrieved_queries = future.result()
                    for query in retrieved_queries:
                        if not self.queries_logged.has_key(query.query_id):
                            self.queries_logged[query.query_id] = True
                            query.node = node
                            queries.append(query)
                except Exception as e:
                    print("Something went wrong {}".format(e))

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(self.query_profiler, query, 2): query
                for query in queries
            }

            for future in concurrent.futures.as_completed(futures):
                try:
                    query = future.result()

                    # send to elastic
                    self.elasticsearch.index(index='impala_queries',
                                             doc_type='query',
                                             id=query.query_id,
                                             body=query.to_dict())
                    print("[{}] Query {} indexed".format(query.start_time,
                                                         query.query_id))
                except Exception as e:
                    print('Something went wrong {}'.format(e))

    @staticmethod
    def query_retriever(node: str, timeout: int = 1) -> list:
        url = "{schema}{ip}/{path}".format(schema='http://', ip=node, path='queries')
        request = requests.get(url, timeout=timeout)
        if request.status_code != 200:
            return False

        parser = ImpalaQueryLogParser(request.text)
        queries = parser.queries

        if not queries:
            return []
        return queries

    @staticmethod
    def query_profiler(query: Query, timeout: int = 1) -> Query:
        url = "{schema}{ip}/query_profile?query_id={query_id}".format(
            schema='http://', ip=query.node, query_id=query.query_id)
        request = requests.get(url, timeout=timeout)
        if request.status_code != 200:
            return False

        return ImpalaQueryLogParser(request.text).extract_profile(query)