def pseudonymise(input, output_folder, chunksize, salt, person_id): logger = Logger("pseudonymise") logger.info( f"Working on file {input}, pseudonymising column '{person_id}' with salt '{salt}'" ) #create the dir os.makedirs(output_folder, exist_ok=True) f_out = f"{output_folder}{os.path.sep}{os.path.basename(input)}" logger.info(f"Saving new file to {f_out}") #load data data = coconnect.tools.load_csv(input, chunksize=chunksize) i = 0 while True: data[input][person_id] = data[input][person_id].apply( lambda x: hashlib.sha256((x + salt).encode("UTF-8")).hexdigest()) logger.info(data[input][person_id]) mode = 'w' header = True if i > 0: mode = 'a' header = False data[input].to_csv(f_out, mode=mode, header=header, index=False) log.info( f"Finished {input} of size={len(data[input])} on iteration {i}") i += 1 try: data.next() except StopIteration: break logger.info("Done!") return f_out
class Profiler: def __init__(self,name=None,interval=0.1): if name == None: name = self.__class__.__name__ else: name = f"{self.__class__.__name__}_{name}" self.logger = Logger(self.__class__.__name__) #retrieve the process id for the current run self.pid = os.getpid() #create a psutil instance to montior this self.py = psutil.Process(self.pid) #set the interval (seconds) of how often to check the cpu and memory self.interval = interval self.logger.info(f"tracking {self.pid} every {self.interval} seconds") #count the number of cpus the computer running this process has self.cpu_count = psutil.cpu_count() self.logger.info(f"{self.cpu_count} cpus available") #initiate a threaded function #that will run in a separate process and can monitor CPU/memory in the background self.th = threading.Thread(target=self.track) #init some global variables self.tracking = [] self.init_time = time.time() self._stop = False self._df = None def start(self): #start the thread self.logger.info("starting profiling") self.th.start() def stop(self): #stop the thread self._stop = True self.th.join() self.logger.info("finished profiling") def get_df(self): #build a little dataframe for cpu/memory v.s. time, #if it has not been built already if self._df is None: self._df = pd.DataFrame(self.tracking) return self._df def summary(self): #print the dataframe created for cpu/memory v.s. time self.logger.info(self.get_df()) def track(self): """ Main function to profile CPU and memory usage """ #while the program has been told to profile the usage while self._stop == False: #from the current process, calculate the current memory usage (in GB) memory = self.py.memory_info()[0]/2.**30 #also calculate the CPU % in use at this epoch in time cpu = self.py.cpu_percent() / self.cpu_count #calcuate the current time - time since the start of the process current_time = time.time() - self.init_time #log the data info = {'time[s]':current_time,'memory[GB]':memory,'cpu[%]':cpu} self.tracking.append(info) #sleep the number of seconds requested time.sleep(self.interval) #once finished, call the summary function self.summary()